diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index ef03b381f13..71963c1f725 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -293,7 +293,6 @@ tf_cuda_cc_test( "//conditions:default": [], }), tags = [ - "no_oss", # http://b/119522529 "noasan", ], # We must ensure that the dependencies can be dynamically linked since diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 23913a6cdf0..21d72ac96b5 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -30,8 +30,8 @@ limitations under the License. #include "tensorflow/cc/ops/while_loop.h" #include "tensorflow/cc/saved_model/loader.h" #include "tensorflow/core/distributed_runtime/server_lib.h" +#include "tensorflow/core/framework/logging.h" #include "tensorflow/core/framework/op_gen_lib.h" -#include "tensorflow/core/kernels/logging_ops.h" #endif // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) #include "tensorflow/c/c_api_internal.h" #include "tensorflow/core/common_runtime/device_mgr.h" diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD index ce8dae42629..6362470abef 100644 --- a/tensorflow/compiler/aot/tests/BUILD +++ b/tensorflow/compiler/aot/tests/BUILD @@ -36,6 +36,7 @@ py_binary( name = "make_test_graphs", testonly = 1, srcs = ["make_test_graphs.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/core:protos_all_py", diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index ef91c85ec36..3a8e9fd4fd3 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -220,6 +220,7 @@ cc_library( name = "shape_inference_helpers", srcs = ["shape_inference_helpers.cc"], hdrs = ["shape_inference_helpers.h"], + visibility = [":friends"], deps = ["//tensorflow/core:graph"], ) @@ -262,7 +263,6 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", @@ -270,6 +270,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/memory", @@ -466,6 +467,9 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:graph", "//tensorflow/core:protos_all_cc", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", ], diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc index 6f60c26e30f..8621c4374b7 100644 --- a/tensorflow/compiler/jit/compilability_check_util.cc +++ b/tensorflow/compiler/jit/compilability_check_util.cc @@ -165,6 +165,18 @@ bool LogNotCompilableAndReturn(const Node& node, return false; } +bool RecursiveCompilabilityChecker::OpIsInaccurate(const Node& node) { + // b/127344411: SelfAdjointEigV2 and Svd precision issues. + return node.type_string() == "SelfAdjointEigV2" || + node.type_string() == "Svd"; +} + +bool RecursiveCompilabilityChecker::OpIsSlow(const Node& node) { + // b/128001705: SelfAdjointEigV2 and Svd performance issues. 
+ return node.type_string() == "SelfAdjointEigV2" || + node.type_string() == "Svd" || node.type_string() == "Qr"; +} + bool RecursiveCompilabilityChecker::IsCompilableNode( const Node& node, int depth, FunctionLibraryRuntime* lib_runtime) { // _Arg nodes in a top-level function represent feeds and _Retval nodes in a @@ -228,8 +240,12 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( "resource variable op in called function"); } - if (!op_filter_.allow_svd_op && node.type_string() == "Svd") { - return LogNotCompilableAndReturn(node, "Svd ops disabled"); + if (!op_filter_.allow_slow_and_inaccurate_ops && OpIsInaccurate(node)) { + return LogNotCompilableAndReturn(node, "operation with correctness issues"); + } + + if (!op_filter_.allow_slow_and_inaccurate_ops && OpIsSlow(node)) { + return LogNotCompilableAndReturn(node, "slow operation"); } return true; @@ -248,7 +264,8 @@ RecursiveCompilabilityChecker::OperationFilter CreateOperationFilter( registration.elide_assert_and_checknumerics; op_filter.allow_ops_producing_or_consuming_variant = registration.cluster_variant_ops; - op_filter.allow_svd_op = registration.cluster_svd_op; + op_filter.allow_slow_and_inaccurate_ops = + registration.cluster_slow_and_inaccurate_ops; return op_filter; } diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h index 7a2cac8cd9d..0ef42d66821 100644 --- a/tensorflow/compiler/jit/compilability_check_util.h +++ b/tensorflow/compiler/jit/compilability_check_util.h @@ -97,10 +97,9 @@ class RecursiveCompilabilityChecker { // live-out DT_VARIANT values. bool allow_ops_producing_or_consuming_variant; - // Whether the "Svd" op should be auto-clustered. The XLA implemenation of - // this op has some performance (b/128001705) and possibly correctness - // (b/127344411) issues so we avoid auto-clustering it. - bool allow_svd_op; + // Whether ops known to be slow or to have correctness issues should be + // auto-clustered. + bool allow_slow_and_inaccurate_ops; }; RecursiveCompilabilityChecker(const OperationFilter* op_filter, @@ -119,6 +118,11 @@ class RecursiveCompilabilityChecker { return IsCompilableCall(call_def, /*depth=*/0, lib_runtime); } + // Returns true if XLA supports this Op, but we don't want to cluster it (ie: + // due to performance or correctness concerns). 
+ bool OpIsInaccurate(const Node& node); + bool OpIsSlow(const Node& node); + private: bool IsCompilableNode(const Node& node, int depth, FunctionLibraryRuntime* lib_runtime); diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc index 53098cfa78e..0a92c06ad10 100644 --- a/tensorflow/compiler/jit/deadness_analysis.cc +++ b/tensorflow/compiler/jit/deadness_analysis.cc @@ -371,7 +371,8 @@ class PredicateFactory { Predicate** predicate) { TensorId tensor_id(node->name(), output_idx); - bool is_boolean_tensor = node->output_type(tensor_id.index()) == DT_BOOL; + bool is_boolean_tensor = + BaseType(node->output_type(tensor_id.index())) == DT_BOOL; TF_RET_CHECK(!must_be_true || is_boolean_tensor); if (node->type_string() == "Const" && must_be_true) { diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc index b879b861f4f..3a44eb7db75 100644 --- a/tensorflow/compiler/jit/deadness_analysis_test.cc +++ b/tensorflow/compiler/jit/deadness_analysis_test.cc @@ -1067,5 +1067,25 @@ TEST(DeadnessAnalysisTest, ConstantFalseSwitchCondition) { EXPECT_EQ(predicate_map[ControlOutputFor(id_true)], "#false"); } +TEST(DeadnessAnalysisTest, RefBoolSwitchCondition) { + Scope root = Scope::NewRootScope().ExitOnError(); + + Output condition_ref_var = + ops::Variable(root.WithOpName("cond_ref"), TensorShape({}), DT_BOOL); + Output value = ops::Placeholder(root.WithOpName("value"), DT_FLOAT); + ops::Switch sw(root.WithOpName("switch"), value, condition_ref_var); + + Output id_false = ops::Identity(root.WithOpName("id_false"), sw.output_false); + Output id_true = ops::Identity(root.WithOpName("id_true"), sw.output_true); + + FixupSourceAndSinkEdges(root.graph()); + + PredicateMapTy predicate_map; + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map)); + + EXPECT_EQ(predicate_map[ControlOutputFor(id_false)], "~*cond_ref:0"); + EXPECT_EQ(predicate_map[ControlOutputFor(id_true)], "*cond_ref:0"); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/device_util.cc b/tensorflow/compiler/jit/device_util.cc index b2dee129239..200e795a2e8 100644 --- a/tensorflow/compiler/jit/device_util.cc +++ b/tensorflow/compiler/jit/device_util.cc @@ -97,28 +97,19 @@ Status DeviceNameToDeviceType(const string& device, DeviceType* device_type) { return Status::OK(); } -Status PickDeviceForXlaImpl(const jit::DeviceInfoCache& device_info_cache, - const jit::DeviceSet& devices, - bool allow_mixing_unknown_and_cpu, - bool* out_can_pick_device, - absl::optional<jit::DeviceId>* out_device_picked) { - if (out_can_pick_device) { - *out_can_pick_device = true; - } - +xla::StatusOr<absl::optional<jit::DeviceId>> PickDeviceForXlaImpl( + const jit::DeviceInfoCache& device_info_cache, + const jit::DeviceSet& devices, bool allow_mixing_unknown_and_cpu, + bool failure_to_pick_is_error) { #define FAILED_TO_PICK_DEVICE(failing_status) \ do { \ - if (out_can_pick_device) { \ - *out_can_pick_device = false; \ - return Status::OK(); \ - } else { \ + if (failure_to_pick_is_error) { \ return failing_status; \ + } else { \ + return {absl::nullopt}; \ } \ } while (false) - TF_RET_CHECK(!devices.IsEmpty()) << "No devices to choose from"; - DCHECK_NE(out_can_pick_device == nullptr, out_device_picked == nullptr); - absl::optional<jit::DeviceId> maybe_gpu_device; absl::optional<jit::DeviceId> maybe_cpu_device; absl::optional<jit::DeviceId> maybe_unknown_device; @@ -182,17 +173,15 @@ Status PickDeviceForXlaImpl(const 
jit::DeviceInfoCache& device_info_cache, } } - if (out_device_picked) { - if (maybe_gpu_device) { - *out_device_picked = *maybe_gpu_device; - } else if (maybe_unknown_device) { - *out_device_picked = *maybe_unknown_device; - } else { - *out_device_picked = *maybe_cpu_device; - } + if (maybe_gpu_device) { + return {*maybe_gpu_device}; + } else if (maybe_unknown_device) { + return {*maybe_unknown_device}; + } else if (maybe_cpu_device) { + return {*maybe_cpu_device}; } - return Status::OK(); + FAILED_TO_PICK_DEVICE(errors::Internal("Empty device set!")); #undef FAILED_TO_PICK_DEVICE } @@ -200,21 +189,18 @@ Status PickDeviceForXlaImpl(const jit::DeviceInfoCache& device_info_cache, xla::StatusOr<jit::DeviceId> PickDeviceForXla( const jit::DeviceInfoCache& device_info_cache, const jit::DeviceSet& devices, bool allow_mixing_unknown_and_cpu) { - absl::optional<jit::DeviceId> device; - TF_RETURN_IF_ERROR(PickDeviceForXlaImpl( - device_info_cache, devices, allow_mixing_unknown_and_cpu, - /*out_can_pick_device=*/nullptr, &device)); - return *device; + TF_ASSIGN_OR_RETURN(absl::optional<jit::DeviceId> device_id, + PickDeviceForXlaImpl(device_info_cache, devices, + allow_mixing_unknown_and_cpu, + /*failure_to_pick_is_error=*/true)); + return *device_id; } -xla::StatusOr<bool> CanPickDeviceForXla( +xla::StatusOr<absl::optional<jit::DeviceId>> MaybePickDeviceForXla( const jit::DeviceInfoCache& device_info_cache, const jit::DeviceSet& devices, bool allow_mixing_unknown_and_cpu) { - bool can_pick_device; - TF_RETURN_IF_ERROR(PickDeviceForXlaImpl(device_info_cache, devices, - allow_mixing_unknown_and_cpu, - &can_pick_device, - /*out_device_picked=*/nullptr)); - return can_pick_device; + return PickDeviceForXlaImpl(device_info_cache, devices, + allow_mixing_unknown_and_cpu, + /*failure_to_pick_is_error=*/false); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/device_util.h b/tensorflow/compiler/jit/device_util.h index f3c6dec687e..f26a565ff12 100644 --- a/tensorflow/compiler/jit/device_util.h +++ b/tensorflow/compiler/jit/device_util.h @@ -71,17 +71,34 @@ class DeviceSet { // iterator if this ends up being used widely. for (int word_index = 0; word_index < storage_.size(); word_index++) { uint64 word = storage_[word_index]; - for (int bit_index = 0; bit_index < kWordSize; bit_index++) { - if (word & (1ull << bit_index)) { - if (!func(DeviceId(word_index * kWordSize + bit_index))) { - return; - } + while (word != 0) { + uint64 only_lowest_bit_set = word & -word; + // The number of trailing zeros in a non-zero word is the index of the + // least significant 1. + int bit_index = ctz_uint64(word); + if (!func(DeviceId(word_index * kWordSize + bit_index))) { + return; } + word ^= only_lowest_bit_set; } } } private: + static int ctz_uint64(uint64 x) { + DCHECK_NE(x, 0); +#ifdef __GNUC__ + return __builtin_ctzl(x); +#else + int result = 0u; + while ((x & 1u) == 0u) { + x >>= 1; + ++result; + } + return result; +#endif + } + absl::InlinedVector<uint64, 1> storage_; const int kWordSize = 64; @@ -181,9 +198,12 @@ xla::StatusOr<jit::DeviceId> PickDeviceForXla( const jit::DeviceInfoCache& device_info_cache, const jit::DeviceSet& devices, bool allow_mixing_unknown_and_cpu); -// This is like `PickDeviceForXla` except that it returns false (instead of a +// This is like `PickDeviceForXla` except that it returns nullopt (instead of a // non-OK Status) if no unambiguous choice of device exists. 
-xla::StatusOr<bool> CanPickDeviceForXla( +// +// We return a failing Status for errors unrelated to the device choice +// algorithm itself. +xla::StatusOr<absl::optional<jit::DeviceId>> MaybePickDeviceForXla( const jit::DeviceInfoCache& device_info_cache, const jit::DeviceSet& devices, bool allow_mixing_unknown_and_cpu); } // namespace tensorflow diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index 261519de347..958b0a5f61c 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -537,8 +537,9 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library, XlaClusterInfo{func, func_name_attrs, xla_computation_node, std::map<string, int>{}}); } + bool modified; s = ExtractOutsideCompilation("_encapsulate", "_outside", clusters, - graph_out.get(), flr, lib_def.get()); + graph_out.get(), flr, lib_def.get(), &modified); if (!s.ok()) return s; GraphDef graphdef_out; @@ -1105,7 +1106,9 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"shapes", absl::Span<const DataType>({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", - absl::Span<const string>({"_xla_token_arg_node"})}}, + absl::Span<const string>( + {"_xla_token_arg_node", + "outside_compilation_O1_host_compute"})}}, {"F"}}, {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", @@ -1985,7 +1988,9 @@ TEST(EncapsulateSubgraphsTest, {"shapes", absl::Span<const TensorShapeProto>({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", - absl::Span<const string>({"_xla_token_arg_node"})}}}, + absl::Span<const string>( + {"_xla_token_arg_node", + "outside_compilation_O1_host_compute"})}}}, }, {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"}, {"h_0_retval_retval", "H:o:0"}}); @@ -2110,7 +2115,9 @@ TEST(EncapsulateSubgraphsTest, {"shapes", absl::Span<const TensorShapeProto>({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", - absl::Span<const string>({"_xla_token_arg_node"})}}}, + absl::Span<const string>( + {"_xla_token_arg_node", + "outside_compilation_O1_host_compute"})}}}, {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"D:o:0"}, @@ -2258,7 +2265,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"shapes", absl::Span<const TensorShapeProto>({})}, {"_outside_compilation_subgraph", "O2"}, {"_xla_token_input_nodes", - absl::Span<const string>({"_xla_token_arg_node"})}}, + absl::Span<const string>( + {"_xla_token_arg_node", "outside_compilation_O1_host_compute"})}}, {}}, {{"outside_compilation_O3_host_compute"}, "XlaHostCompute", @@ -2271,7 +2279,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"shapes", absl::Span<const TensorShapeProto>({})}, {"_outside_compilation_subgraph", "O3"}, {"_xla_token_input_nodes", - absl::Span<const string>({"_xla_token_arg_node"})}}, + absl::Span<const string>({"_xla_token_arg_node", + "outside_compilation_O1_host_compute", + "outside_compilation_O2_host_compute"})}}, {}}}, {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"}, {"h_0_retval_retval", "H:o:0"}}); diff --git a/tensorflow/compiler/jit/encapsulate_util.cc b/tensorflow/compiler/jit/encapsulate_util.cc index 2264806d6bd..ae0912c3f23 100644 --- a/tensorflow/compiler/jit/encapsulate_util.cc +++ b/tensorflow/compiler/jit/encapsulate_util.cc @@ -14,9 +14,12 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/jit/encapsulate_util.h" + #include <algorithm> #include <iterator> +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "absl/strings/str_cat.h" #include "absl/types/optional.h" #include "tensorflow/compiler/jit/shape_inference.h" @@ -24,6 +27,9 @@ limitations under the License. #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/core/error_codes.pb.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +using stream_executor::port::StatusOr; namespace tensorflow { @@ -333,6 +339,43 @@ Status PerformStaticShapeInferenceBeforeEncapsulation(Graph* g) { return Status::OK(); } +StatusOr<std::unique_ptr<absl::flat_hash_map<string, std::vector<string>>>> +OutsideCompilationClusterDependencies( + const Graph* g, const string& outside_compilation_attr_name) { + auto cluster_deps = absl::make_unique< + absl::flat_hash_map<string, absl::flat_hash_set<string>>>(); + + for (const Edge* e : g->edges()) { + auto src_outside_compilation = + GetStringAttr(*e->src(), outside_compilation_attr_name); + auto dst_outside_compilation = + GetStringAttr(*e->dst(), outside_compilation_attr_name); + + if (src_outside_compilation && dst_outside_compilation && + *src_outside_compilation != *dst_outside_compilation) { + auto dst_deps_it = cluster_deps->find(*dst_outside_compilation); + if (dst_deps_it == cluster_deps->end()) { + cluster_deps->insert(std::make_pair( + *dst_outside_compilation, + absl::flat_hash_set<string>({*src_outside_compilation}))); + } else { + dst_deps_it->second.insert(*src_outside_compilation); + } + } + } + + auto cluster_deps_ordered = + absl::make_unique<absl::flat_hash_map<string, std::vector<string>>>(); + + for (auto it = cluster_deps->begin(); it != cluster_deps->end(); it++) { + std::vector<string> ordered_deps(it->second.begin(), it->second.end()); + std::sort(ordered_deps.begin(), ordered_deps.end()); + cluster_deps_ordered->insert(std::make_pair(it->first, ordered_deps)); + } + + return std::move(cluster_deps_ordered); +} + Status PreprocessEdgesBetweenOutsideCompilations( Graph* g, const string& outside_compilation_attr_name) { // Remove edges from source node to outside compilation nodes, and edges diff --git a/tensorflow/compiler/jit/encapsulate_util.h b/tensorflow/compiler/jit/encapsulate_util.h index c9f16d14168..c873c2a888c 100644 --- a/tensorflow/compiler/jit/encapsulate_util.h +++ b/tensorflow/compiler/jit/encapsulate_util.h @@ -19,7 +19,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_ #define TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_ +#include "absl/container/flat_hash_map.h" #include "tensorflow/core/graph/graph.h" +#include "tensorflow/stream_executor/lib/statusor.h" namespace tensorflow { @@ -89,6 +91,15 @@ struct XlaClusterInfo { const std::map<string, int> host_compute_core; }; +// Finds dependencies between outside compilation clusters, including both data +// dependencies and control dependencies. cluster_deps maps the name name of an +// outside compilation cluster to a set of names of outside compilation clusters +// that it depends on. +stream_executor::port::StatusOr< + std::unique_ptr<absl::flat_hash_map<string, std::vector<string>>>> +OutsideCompilationClusterDependencies( + const Graph* g, const string& outside_compilation_attr_name); + // Preprocesses edges within the same XLA cluster. 
It will perform the following // operations in order: // diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc index 5eda028a850..a6e66657fb5 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc @@ -15,12 +15,14 @@ limitations under the License. #include "tensorflow/compiler/jit/extract_outside_compilation_pass.h" +#include "absl/container/flat_hash_map.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" #include "tensorflow/compiler/jit/encapsulate_util.h" #include "tensorflow/compiler/tf2xla/side_effect_util.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" @@ -287,15 +289,20 @@ absl::optional<std::vector<PartialTensorShape>> GetInferredInputShapes( return results; } +string host_compute_node_name(const string& original_oc_name) { + return absl::StrCat("outside_compilation_", original_oc_name, + "_host_compute"); +} + // Builds XlaHostCompute NodeDef from the outside compilation call node. xla::StatusOr<NodeDef> BuildXlaHostComputeNodeDef( - const Node* call_node, const std::map<string, int>& host_compute_core) { + const Node* call_node, const std::map<string, int>& host_compute_core, + const absl::flat_hash_map<string, std::vector<string>>& cluster_deps) { string original_oc_name; TF_RETURN_IF_ERROR(GetNodeAttr( call_node->attrs(), "_outside_compilation_subgraph", &original_oc_name)); - NodeDefBuilder host_compute_builder( - absl::StrCat("outside_compilation_", original_oc_name, "_host_compute"), - "XlaHostCompute"); + NodeDefBuilder host_compute_builder(host_compute_node_name(original_oc_name), + "XlaHostCompute"); // Copy all attributes. for (auto attr : call_node->attrs()) { @@ -309,9 +316,25 @@ xla::StatusOr<NodeDef> BuildXlaHostComputeNodeDef( host_compute_builder.Attr("tpu_core", core); } - // Set input tokens. - host_compute_builder.Attr(kXlaTokenInputNodesAttrName, - std::vector<string>{kXlaTokenArgNodeName}); + // Set input tokens and other outside compilation clusters that current + // cluster depends in `kXlaTokenArgNodeName`. This is needed because when + // outside compilation subgraphs are encapsulated and moved to host graph, + // control/data edges between them will only be reflected in host graph. + // From XLA's perspective, two originally dependent clusters are no longer + // connected, which makes them look like they can be scheduled for execution + // in arbitrary order even though in fact they must be executed in order + // according to their host-side graph dependency. This can cause deadlock. + // Therefore, we hint XLA what the correct ordering of these clusters should + // be to avoid deadlocks. + std::vector<string> xla_token_input_nodes; + xla_token_input_nodes.emplace_back(kXlaTokenArgNodeName); + auto cluster_deps_it = cluster_deps.find(original_oc_name); + if (cluster_deps_it != cluster_deps.end()) { + for (auto dep : cluster_deps_it->second) { + xla_token_input_nodes.emplace_back(host_compute_node_name(dep)); + } + } + host_compute_builder.Attr(kXlaTokenInputNodesAttrName, xla_token_input_nodes); // Populate inputs. 
std::vector<DataType> input_dtypes; @@ -371,7 +394,8 @@ Status ValidateOutsideCompilationCallNode(Node* call_node) { // If the function call node has no input/output edges, we will just remove it // and not create a XlaHostCompute node. Status ReplaceOrRemoveOutsideCompilationCallNode( - Graph* g, Node* call_node, const std::map<string, int>& host_compute_core) { + Graph* g, Node* call_node, const std::map<string, int>& host_compute_core, + const absl::flat_hash_map<string, std::vector<string>>& cluster_deps) { // If the function call node has no input/output edges, just remove it. bool has_edge = false; for (auto e : call_node->in_edges()) { @@ -393,8 +417,9 @@ Status ReplaceOrRemoveOutsideCompilationCallNode( } // Build XlaHostCompute NodeDef. - TF_ASSIGN_OR_RETURN(NodeDef node_def, - BuildXlaHostComputeNodeDef(call_node, host_compute_core)); + TF_ASSIGN_OR_RETURN( + NodeDef node_def, + BuildXlaHostComputeNodeDef(call_node, host_compute_core, cluster_deps)); TF_ASSIGN_OR_RETURN(Node * host_compute_node, ReplaceNode(g, call_node, node_def)); VLOG(4) << "Added HostCompute node: " << host_compute_node->DebugString(); @@ -1589,6 +1614,11 @@ Status ExtractOutsideCompilationForFunction( // We cannot early return here, because we might have outside compilation in // If/While function body. + // Find dependencies between outside compilation clusters. + TF_ASSIGN_OR_RETURN(auto cluster_deps, + OutsideCompilationClusterDependencies( + fbody->graph, outside_compilation_attr_name)); + // Preprocess edges between different outside compilations. They will be // restored in `ConstructHostGraph()`. TF_RETURN_IF_ERROR(PreprocessEdgesBetweenOutsideCompilations( @@ -1643,7 +1673,7 @@ Status ExtractOutsideCompilationForFunction( for (Node* n : outside_compilation_nodes) { TF_RETURN_IF_ERROR(ValidateOutsideCompilationCallNode(n)); TF_RETURN_IF_ERROR(ReplaceOrRemoveOutsideCompilationCallNode( - graph_out.get(), n, host_compute_core)); + graph_out.get(), n, host_compute_core, *cluster_deps)); } // Handle nodes with associated functions. 
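As an illustrative aside (not part of the patch): the `_xla_token_input_nodes` ordering hint described above can be pictured with plain standard-library types. The sketch below uses a hypothetical `TokenInputNodes` helper and a simple map standing in for the result of OutsideCompilationClusterDependencies; the real patch does the equivalent inside BuildXlaHostComputeNodeDef.

// Standalone sketch, hypothetical names: fold a cluster-dependency map into
// the token-input list for one cluster, mirroring the ordering hint the patch
// attaches to each XlaHostCompute node.
#include <iostream>
#include <map>
#include <string>
#include <vector>

std::string HostComputeNodeName(const std::string& oc_name) {
  return "outside_compilation_" + oc_name + "_host_compute";
}

std::vector<std::string> TokenInputNodes(
    const std::string& cluster,
    const std::map<std::string, std::vector<std::string>>& cluster_deps) {
  // The token arg node always comes first; host computes of depended-on
  // clusters follow, so XLA sees an ordering constraint between them.
  std::vector<std::string> token_inputs = {"_xla_token_arg_node"};
  auto it = cluster_deps.find(cluster);
  if (it != cluster_deps.end()) {
    for (const std::string& dep : it->second) {
      token_inputs.push_back(HostComputeNodeName(dep));
    }
  }
  return token_inputs;
}

int main() {
  // Cluster "1" depends on cluster "0", as in the tests added later in this patch.
  std::map<std::string, std::vector<std::string>> deps = {{"1", {"0"}}};
  for (const std::string& name : TokenInputNodes("1", deps)) {
    std::cout << name << "\n";
  }
  return 0;
}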
@@ -1691,11 +1721,13 @@ Status ExtractOutsideCompilation( const string& xla_cluster_attr_name, const string& outside_compilation_attr_name, const std::unordered_map<string, XlaClusterInfo>& clusters, Graph* g, - FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld) { + FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, + bool* modified) { if (VLOG_IS_ON(4)) { DumpGraphToFile("extract_outside_compilation_before", *g, fld); } + *modified = false; auto node_name_index = g->BuildNodeNameIndex(); for (auto& iter : clusters) { string xla_cluster_name = iter.first; @@ -1711,6 +1743,7 @@ Status ExtractOutsideCompilation( func_name_attrs, func_name_attrs.name(), host_graph_func_name, host_compute_core, flr, fld, &shape_inference_graphs, &has_outside_compilation)); + *modified |= has_outside_compilation; string pivot_name = absl::StrCat(xla_cluster_name, "/pivot"); Node* pivot_node = node_name_index[pivot_name]; diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.h b/tensorflow/compiler/jit/extract_outside_compilation_pass.h index d64cc2a103e..0a29fdaa5c8 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.h +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.h @@ -101,7 +101,8 @@ Status ExtractOutsideCompilation( const string& xla_cluster_attr_name, const string& outside_compilation_attr_name, const std::unordered_map<string, XlaClusterInfo>& clusters, Graph* g, - FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld); + FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld, + bool* modified); } // namespace tensorflow diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc index b722faf1f3f..93817378e96 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc @@ -922,4 +922,145 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) { } } +TEST_F(ExtractOutsideCompilationForFunctionTest, + OutsideCompilationClusterDataDependency) { + // Build the XLA computation func. 
+ // "const0" + // "identity0" = "const0" (outside compilation cluster "0") + // "identity1" = "identity0" (outside compilation cluster "1") + // "identity2" = "identity1" + FunctionDefLibrary fdl; + { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + Output const0 = ops::Const(s.WithOpName("const0"), 1, {2}); + Output identity0 = ops::Identity(s.WithOpName("identity0"), const0); + Output identity1 = ops::Identity(s.WithOpName("identity1"), identity0); + Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1); + std::unique_ptr<Graph> g(new Graph(OpRegistry::Global())); + TF_CHECK_OK(s.ToGraph(g.get())); + std::cout << "Graph is " << (*g).ToGraphDefDebug().DebugString() + << std::endl; + auto node_name_image = g->BuildNodeNameIndex(); + node_name_image["identity0"]->AddAttr("_oc", "0"); + node_name_image["identity1"]->AddAttr("_oc", "1"); + + PartialTensorShape shape({2}); + node_name_image["identity1"]->AddAttr( + kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape}); + + FunctionDef *xla_fdef = fdl.add_function(); + TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); + } + FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); + + protobuf::Map<string, tensorflow::AttrValue> attrs; + std::map<string, int> host_compute_core = {{"0", 1}, {"1", 0}}; + std::vector<string> shape_inference_graphs; + bool has_outside_compilation; + NameAttrList name_attrs; + name_attrs.set_name("cluster"); + *name_attrs.mutable_attr() = attrs; + TF_CHECK_OK(ExtractOutsideCompilationTest( + "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph", + host_compute_core, &fld, &shape_inference_graphs, + &has_outside_compilation)); + + // Get rewritten XLA computation function. + std::unique_ptr<FunctionBody> xla_fbody; + TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), + AttrSlice(), &fld, &xla_fbody)); + auto node_name_index = xla_fbody->graph->BuildNodeNameIndex(); + + // Check XlaHostCompute nodes. + Node *host_compute_0 = node_name_index["outside_compilation_0_host_compute"]; + EXPECT_NE(host_compute_0, nullptr); + Node *host_compute_1 = node_name_index["outside_compilation_1_host_compute"]; + EXPECT_NE(host_compute_1, nullptr); + + // Check XlaHostCompute nodes' "_xla_token_input_nodes" attr. + std::vector<string> token_input_nodes; + TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()), + "_xla_token_input_nodes", &token_input_nodes)); + + std::vector<string> expected_token_input_nodes_0({"_xla_token_arg_node"}); + EXPECT_EQ(token_input_nodes, expected_token_input_nodes_0); + token_input_nodes.clear(); + std::vector<string> expected_token_input_nodes_1( + {"_xla_token_arg_node", "outside_compilation_0_host_compute"}); + TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()), + "_xla_token_input_nodes", &token_input_nodes)); + EXPECT_EQ(token_input_nodes, expected_token_input_nodes_1); +} + +TEST_F(ExtractOutsideCompilationForFunctionTest, + OutsideCompilationClusterControlDependency) { + // Build the XLA computation func. 
+ // "const0" + // "identity0" = "const0" (outside compilation cluster "0") + // "identity1" = "const0" "^identity0" (outside compilation cluster "1", + // control depdent on cluster "0") + // "identity2" = "identity1" + FunctionDefLibrary fdl; + { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + Output const0 = ops::Const(s.WithOpName("const0"), 1, {2}); + Output identity0 = ops::Identity(s.WithOpName("identity0"), const0); + Output identity1 = ops::Identity( + s.WithOpName("identity1").WithControlDependencies(identity0), const0); + Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1); + std::unique_ptr<Graph> g(new Graph(OpRegistry::Global())); + TF_CHECK_OK(s.ToGraph(g.get())); + std::cout << "Graph is " << (*g).ToGraphDefDebug().DebugString() + << std::endl; + auto node_name_image = g->BuildNodeNameIndex(); + node_name_image["identity0"]->AddAttr("_oc", "0"); + node_name_image["identity1"]->AddAttr("_oc", "1"); + + PartialTensorShape shape({2}); + node_name_image["identity1"]->AddAttr( + kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape}); + + FunctionDef *xla_fdef = fdl.add_function(); + TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef)); + } + FunctionLibraryDefinition fld(OpRegistry::Global(), fdl); + + protobuf::Map<string, tensorflow::AttrValue> attrs; + std::map<string, int> host_compute_core = {{"0", 1}, {"1", 0}}; + std::vector<string> shape_inference_graphs; + bool has_outside_compilation; + NameAttrList name_attrs; + name_attrs.set_name("cluster"); + *name_attrs.mutable_attr() = attrs; + TF_CHECK_OK(ExtractOutsideCompilationTest( + "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph", + host_compute_core, &fld, &shape_inference_graphs, + &has_outside_compilation)); + + // Get rewritten XLA computation function. + std::unique_ptr<FunctionBody> xla_fbody; + TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), + AttrSlice(), &fld, &xla_fbody)); + auto node_name_index = xla_fbody->graph->BuildNodeNameIndex(); + + // Check XlaHostCompute nodes. + Node *host_compute_0 = node_name_index["outside_compilation_0_host_compute"]; + EXPECT_NE(host_compute_0, nullptr); + Node *host_compute_1 = node_name_index["outside_compilation_1_host_compute"]; + EXPECT_NE(host_compute_1, nullptr); + + // Check XlaHostCompute nodes' "_xla_token_input_nodes" attr. 
+ std::vector<string> token_input_nodes; + TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()), + "_xla_token_input_nodes", &token_input_nodes)); + + std::vector<string> expected_token_input_nodes_0({"_xla_token_arg_node"}); + EXPECT_EQ(token_input_nodes, expected_token_input_nodes_0); + token_input_nodes.clear(); + std::vector<string> expected_token_input_nodes_1( + {"_xla_token_arg_node", "outside_compilation_0_host_compute"}); + TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()), + "_xla_token_input_nodes", &token_input_nodes)); + EXPECT_EQ(token_input_nodes, expected_token_input_nodes_1); +} } // namespace tensorflow diff --git a/tensorflow/compiler/jit/graphcycles/BUILD b/tensorflow/compiler/jit/graphcycles/BUILD index 490555c51b9..f9be7c45743 100644 --- a/tensorflow/compiler/jit/graphcycles/BUILD +++ b/tensorflow/compiler/jit/graphcycles/BUILD @@ -13,11 +13,23 @@ cc_library( srcs = ["graphcycles.cc"], hdrs = ["graphcycles.h"], deps = [ + ":ordered_set", "//tensorflow/core:lib", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "ordered_set", + hdrs = ["ordered_set.h"], + deps = [ + "//tensorflow/core:lib", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/types:span", ], ) @@ -31,3 +43,14 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +tf_cc_test( + name = "ordered_set_test", + srcs = ["ordered_set_test.cc"], + deps = [ + ":ordered_set", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles.cc b/tensorflow/compiler/jit/graphcycles/graphcycles.cc index 3d5e0156b2a..f5655ff71a1 100644 --- a/tensorflow/compiler/jit/graphcycles/graphcycles.cc +++ b/tensorflow/compiler/jit/graphcycles/graphcycles.cc @@ -38,13 +38,16 @@ limitations under the License. 
#include "absl/container/flat_hash_set.h" #include "absl/container/inlined_vector.h" #include "absl/strings/str_cat.h" +#include "tensorflow/compiler/jit/graphcycles/ordered_set.h" #include "tensorflow/core/platform/logging.h" namespace tensorflow { namespace { -typedef std::unordered_set<int32> NodeSet; +using NodeSet = absl::flat_hash_set<int32>; +using OrderedNodeSet = OrderedSet<int32>; + template <typename T> struct VecStruct { typedef absl::InlinedVector<T, 4> type; @@ -53,13 +56,11 @@ template <typename T> using Vec = typename VecStruct<T>::type; struct Node { - Node() : in(4), out(4) {} // Small hashtables for in/out edges - int32 rank; // rank number assigned by Pearce-Kelly algorithm bool visited; // Temporary marker used by depth-first-search void* data; // User-supplied data - NodeSet in; // List of immediate predecessor nodes in graph - NodeSet out; // List of immediate successor nodes in graph + OrderedNodeSet in; // List of immediate predecessor nodes in graph + OrderedNodeSet out; // List of immediate successor nodes in graph }; } // namespace @@ -96,7 +97,7 @@ bool GraphCycles::CheckInvariants() const { if (!ranks.insert(nx->rank).second) { LOG(FATAL) << "Duplicate occurrence of rank " << nx->rank; } - for (auto y : nx->out) { + for (int32 y : nx->out.GetSequence()) { Node* ny = r->nodes_[y]; if (nx->rank >= ny->rank) { LOG(FATAL) << "Edge " << x << "->" << y << " has bad rank assignment " @@ -127,14 +128,14 @@ int32 GraphCycles::NewNode() { void GraphCycles::RemoveNode(int32 node) { Node* x = rep_->nodes_[node]; - for (auto y : x->out) { - rep_->nodes_[y]->in.erase(node); + for (int32 y : x->out.GetSequence()) { + rep_->nodes_[y]->in.Erase(node); } - for (auto y : x->in) { - rep_->nodes_[y]->out.erase(node); + for (int32 y : x->in.GetSequence()) { + rep_->nodes_[y]->out.Erase(node); } - x->in.clear(); - x->out.clear(); + x->in.Clear(); + x->out.Clear(); rep_->free_nodes_.push_back(node); } @@ -147,12 +148,12 @@ void GraphCycles::SetNodeData(int32 node, void* data) { } bool GraphCycles::HasEdge(int32 x, int32 y) const { - return rep_->nodes_[x]->out.find(y) != rep_->nodes_[x]->out.end(); + return rep_->nodes_[x]->out.Contains(y); } void GraphCycles::RemoveEdge(int32 x, int32 y) { - rep_->nodes_[x]->out.erase(y); - rep_->nodes_[y]->in.erase(x); + rep_->nodes_[x]->out.Erase(y); + rep_->nodes_[y]->in.Erase(x); // No need to update the rank assignment since a previous valid // rank assignment remains valid after an edge deletion. } @@ -168,13 +169,13 @@ bool GraphCycles::InsertEdge(int32 x, int32 y) { if (x == y) return false; Rep* r = rep_; Node* nx = r->nodes_[x]; - if (!nx->out.insert(y).second) { + if (!nx->out.Insert(y)) { // Edge already exists. return true; } Node* ny = r->nodes_[y]; - ny->in.insert(x); + ny->in.Insert(x); if (nx->rank <= ny->rank) { // New edge is consistent with existing rank assignment. @@ -185,8 +186,8 @@ bool GraphCycles::InsertEdge(int32 x, int32 y) { // We only need to consider nodes that fall in the range [ny->rank,nx->rank]. if (!ForwardDFS(r, y, nx->rank)) { // Found a cycle. Undo the insertion and tell caller. - nx->out.erase(y); - ny->in.erase(x); + nx->out.Erase(y); + ny->in.Erase(x); // Since we do not call Reorder() on this path, clear any visited // markers left by ForwardDFS. 
ClearVisitedBits(r, r->deltaf_); @@ -212,7 +213,7 @@ static bool ForwardDFS(GraphCycles::Rep* r, int32 n, int32 upper_bound) { nn->visited = true; r->deltaf_.push_back(n); - for (auto w : nn->out) { + for (auto w : nn->out.GetSequence()) { Node* nw = r->nodes_[w]; if (nw->rank == upper_bound) { return false; // Cycle @@ -238,7 +239,7 @@ static void BackwardDFS(GraphCycles::Rep* r, int32 n, int32 lower_bound) { nn->visited = true; r->deltab_.push_back(n); - for (auto w : nn->in) { + for (auto w : nn->in.GetSequence()) { Node* nw = r->nodes_[w]; if (!nw->visited && lower_bound < nw->rank) { r->stack_.push_back(w); @@ -324,7 +325,7 @@ int GraphCycles::FindPath(int32 x, int32 y, int max_path_len, return path_len; } - for (auto w : r->nodes_[n]->out) { + for (auto w : r->nodes_[n]->out.GetSequence()) { if (seen.insert(w).second) { r->stack_.push_back(w); } @@ -378,31 +379,35 @@ bool GraphCycles::ContractEdge(int32 a, int32 b) { } Node* nb = rep_->nodes_[b]; - std::unordered_set<int32> out = std::move(nb->out); - std::unordered_set<int32> in = std::move(nb->in); - for (auto y : out) { - rep_->nodes_[y]->in.erase(b); + OrderedNodeSet out = std::move(nb->out); + OrderedNodeSet in = std::move(nb->in); + for (int32 y : out.GetSequence()) { + rep_->nodes_[y]->in.Erase(b); } - for (auto y : in) { - rep_->nodes_[y]->out.erase(b); + for (int32 y : in.GetSequence()) { + rep_->nodes_[y]->out.Erase(b); } rep_->free_nodes_.push_back(b); - for (auto y : out) { + rep_->nodes_[a]->out.Reserve(rep_->nodes_[a]->out.Size() + out.Size()); + for (int32 y : out.GetSequence()) { InsertEdge(a, y); } - for (auto y : in) { + + rep_->nodes_[a]->in.Reserve(rep_->nodes_[a]->in.Size() + in.Size()); + for (int32 y : in.GetSequence()) { InsertEdge(y, a); } + return true; } -std::unordered_set<int32> GraphCycles::Successors(int32 node) const { - return rep_->nodes_[node]->out; +absl::Span<const int32> GraphCycles::Successors(int32 node) const { + return rep_->nodes_[node]->out.GetSequence(); } -std::unordered_set<int32> GraphCycles::Predecessors(int32 node) const { - return rep_->nodes_[node]->in; +absl::Span<const int32> GraphCycles::Predecessors(int32 node) const { + return rep_->nodes_[node]->in.GetSequence(); } namespace { @@ -444,7 +449,7 @@ string GraphCycles::DebugString() const { continue; } - for (int32 succ : rep_->nodes_[i]->out) { + for (int32 succ : rep_->nodes_[i]->out.GetSequence()) { absl::StrAppend(&result, " \"", i, "\" -> \"", succ, "\"\n"); } } diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles.h b/tensorflow/compiler/jit/graphcycles/graphcycles.h index baa9c61f5c7..28f7fbb1ef8 100644 --- a/tensorflow/compiler/jit/graphcycles/graphcycles.h +++ b/tensorflow/compiler/jit/graphcycles/graphcycles.h @@ -40,8 +40,7 @@ limitations under the License. // FindPath() is linear in the size of the graph. // The current implementation uses O(|V|+|E|) space. -#include <unordered_set> - +#include "absl/types/span.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -119,8 +118,8 @@ class GraphCycles { // Expensive: should only be called from graphcycles_test.cc. bool CheckInvariants() const; - std::unordered_set<int32> Successors(int32 node) const; - std::unordered_set<int32> Predecessors(int32 node) const; + absl::Span<const int32> Successors(int32 node) const; + absl::Span<const int32> Predecessors(int32 node) const; // Returns all nodes in post order. 
// diff --git a/tensorflow/compiler/jit/graphcycles/ordered_set.h b/tensorflow/compiler/jit/graphcycles/ordered_set.h new file mode 100644 index 00000000000..0417782b984 --- /dev/null +++ b/tensorflow/compiler/jit/graphcycles/ordered_set.h @@ -0,0 +1,85 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_GRAPHCYCLES_ORDERED_SET_H_ +#define TENSORFLOW_COMPILER_JIT_GRAPHCYCLES_ORDERED_SET_H_ + +#include <vector> + +#include "absl/container/flat_hash_map.h" +#include "absl/types/span.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +// This is a set data structure that provides a deterministic iteration order. +// The iteration order of elements only depends on the sequence of +// inserts/deletes, so as long as the inserts/deletes happen in the same +// sequence, the set will have the same iteration order. +// +// Assumes that T can be cheaply copied for simplicity. +template <typename T> +class OrderedSet { + public: + // Inserts `value` into the ordered set. Returns true if the value was not + // present in the set before the insertion. + bool Insert(T value) { + bool new_insertion = + value_to_index_.insert({value, value_sequence_.size()}).second; + if (new_insertion) { + value_sequence_.push_back(value); + } + return new_insertion; + } + + // Removes `value` from the set. Assumes `value` is already present in the + // set. + void Erase(T value) { + auto it = value_to_index_.find(value); + DCHECK(it != value_to_index_.end()); + + // Since we don't want to move values around in `value_sequence_` we swap + // the value in the last position and with value to be deleted and then + // pop_back. + value_to_index_[value_sequence_.back()] = it->second; + std::swap(value_sequence_[it->second], value_sequence_.back()); + value_sequence_.pop_back(); + value_to_index_.erase(it); + } + + void Reserve(size_t new_size) { + value_to_index_.reserve(new_size); + value_sequence_.reserve(new_size); + } + + void Clear() { + value_to_index_.clear(); + value_sequence_.clear(); + } + + bool Contains(T value) const { return value_to_index_.contains(value); } + size_t Size() const { return value_sequence_.size(); } + + absl::Span<T const> GetSequence() const { return value_sequence_; } + + private: + // The stable order that we maintain through insertions and deletions. + std::vector<T> value_sequence_; + + // Maps values to their indices in `value_sequence_`. + absl::flat_hash_map<T, int> value_to_index_; +}; +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_GRAPHCYCLES_ORDERED_SET_H_ diff --git a/tensorflow/compiler/jit/graphcycles/ordered_set_test.cc b/tensorflow/compiler/jit/graphcycles/ordered_set_test.cc new file mode 100644 index 00000000000..38ac1cfe9b6 --- /dev/null +++ b/tensorflow/compiler/jit/graphcycles/ordered_set_test.cc @@ -0,0 +1,117 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/graphcycles/ordered_set.h" + +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace { +TEST(OrderedSetTest, Insert) { + OrderedSet<int> ordered_set; + EXPECT_TRUE(ordered_set.Insert(90)); + EXPECT_TRUE(ordered_set.Insert(100)); + EXPECT_TRUE(ordered_set.Insert(80)); + + EXPECT_FALSE(ordered_set.Insert(100)); + + EXPECT_EQ(ordered_set.Size(), 3); + + EXPECT_TRUE(ordered_set.Contains(90)); + EXPECT_TRUE(ordered_set.Contains(100)); + EXPECT_TRUE(ordered_set.Contains(80)); + + EXPECT_FALSE(ordered_set.Contains(40)); + + std::array<int, 3> expected_sequence = {90, 100, 80}; + EXPECT_EQ(ordered_set.GetSequence(), expected_sequence); +} + +TEST(OrderedSetTest, Erase) { + OrderedSet<int> ordered_set; + EXPECT_TRUE(ordered_set.Insert(90)); + EXPECT_TRUE(ordered_set.Insert(100)); + EXPECT_TRUE(ordered_set.Insert(80)); + + ordered_set.Erase(100); + + EXPECT_EQ(ordered_set.Size(), 2); + + EXPECT_TRUE(ordered_set.Contains(90)); + EXPECT_FALSE(ordered_set.Contains(100)); + EXPECT_TRUE(ordered_set.Contains(80)); + + std::array<int, 2> expected_sequence_0 = {90, 80}; + EXPECT_EQ(ordered_set.GetSequence(), expected_sequence_0); + + ordered_set.Erase(80); + + EXPECT_EQ(ordered_set.Size(), 1); + + EXPECT_TRUE(ordered_set.Contains(90)); + EXPECT_FALSE(ordered_set.Contains(100)); + EXPECT_FALSE(ordered_set.Contains(80)); + + std::array<int, 1> expected_sequence_1 = {90}; + EXPECT_EQ(ordered_set.GetSequence(), expected_sequence_1); + + ordered_set.Erase(90); + + EXPECT_EQ(ordered_set.Size(), 0); + + EXPECT_FALSE(ordered_set.Contains(90)); + EXPECT_FALSE(ordered_set.Contains(100)); + EXPECT_FALSE(ordered_set.Contains(80)); + + std::array<int, 0> expected_sequence_2 = {}; + EXPECT_EQ(ordered_set.GetSequence(), expected_sequence_2); +} + +TEST(OrderedSetTest, Clear) { + OrderedSet<int> ordered_set; + EXPECT_TRUE(ordered_set.Insert(90)); + EXPECT_TRUE(ordered_set.Insert(100)); + EXPECT_TRUE(ordered_set.Insert(80)); + + ordered_set.Clear(); + + EXPECT_EQ(ordered_set.Size(), 0); + + EXPECT_FALSE(ordered_set.Contains(90)); + EXPECT_FALSE(ordered_set.Contains(100)); + EXPECT_FALSE(ordered_set.Contains(80)); + + std::array<int, 0> expected_sequence = {}; + EXPECT_EQ(ordered_set.GetSequence(), expected_sequence); +} + +TEST(OrderedSetTest, LargeInsertions) { + const int kSize = 50 * 9000; + + OrderedSet<int> ordered_set; + + for (int i = 0; i < kSize; i++) { + EXPECT_TRUE(ordered_set.Insert(i + 500)); + } + + for (int i = 0; i < kSize; i++) { + EXPECT_EQ(ordered_set.GetSequence()[i], i + 500); + } +} +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 88d00f7f8e1..6df0991e354 100644 --- 
a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -62,7 +62,7 @@ XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) { se::Platform::Id platform_id = nullptr; const XlaDevice::Metadata* xla_device_metadata = nullptr; std::unique_ptr<XlaAllocator> xla_allocator; - xla::DeviceMemoryAllocator* device_allocator = nullptr; + se::DeviceMemoryAllocator* device_allocator = nullptr; if (ctx->device_type() == DeviceType(DEVICE_CPU)) { platform_id = se::host::kHostPlatformId; diff --git a/tensorflow/compiler/jit/kernels/xla_ops.h b/tensorflow/compiler/jit/kernels/xla_ops.h index 7b4d4b5b473..eaa686780e4 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.h +++ b/tensorflow/compiler/jit/kernels/xla_ops.h @@ -40,7 +40,7 @@ class XlaPlatformInfo { se::Platform::Id platform_id, const XlaDevice::Metadata* xla_device_metadata, std::unique_ptr<XlaAllocator> xla_allocator, - xla::DeviceMemoryAllocator* device_allocator) + se::DeviceMemoryAllocator* device_allocator) : device_type_(device_type), platform_id_(platform_id), xla_device_metadata_(xla_device_metadata), @@ -55,7 +55,7 @@ class XlaPlatformInfo { return xla_device_metadata_ && xla_device_metadata_->UseMultipleStreams(); } - xla::DeviceMemoryAllocator* allocator() const { + se::DeviceMemoryAllocator* allocator() const { return device_allocator_ ? device_allocator_ : xla_allocator_.get(); } DeviceType device_type() const { return device_type_; } @@ -86,7 +86,7 @@ class XlaPlatformInfo { // then device_allocator_ is null and xla_allocator_ points to an appropriate // XlaAllocator instance. std::unique_ptr<XlaAllocator> xla_allocator_; - xla::DeviceMemoryAllocator* device_allocator_; + se::DeviceMemoryAllocator* device_allocator_; TF_DISALLOW_COPY_AND_ASSIGN(XlaPlatformInfo); }; diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index a9713f8ea3c..1952a4e0464 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -270,7 +270,7 @@ class MarkForCompilationPassImpl { StatusOr<bool> ShouldCompileCluster(const Cluster& cluster); StatusOr<bool> ClusteringWillIntroduceInterDeviceDependency( - const Cluster& to); + const Cluster& from, const Cluster& to); // Returns true if the devices in `cluster_a` and `cluster_b` are compatible // and therefore not a hindrance for combining the two clusters into a larger @@ -698,7 +698,7 @@ Status MarkForCompilationPassImpl::DumpDebugInfo() { StatusOr<bool> MarkForCompilationPassImpl::ClusteringWillIntroduceInterDeviceDependency( - const Cluster& cluster_to) { + const Cluster& cluster_from, const Cluster& cluster_to) { // If any of the consumer's producers are on a different device, do not // cluster these nodes. This prevents other work on this device from being // delayed by work on other devices. 
We consider predecessors of the entire @@ -722,6 +722,11 @@ MarkForCompilationPassImpl::ClusteringWillIntroduceInterDeviceDependency( if (!devices_compatible) { return true; } + TF_ASSIGN_OR_RETURN(devices_compatible, + AreDevicesCompatible(cluster_from, *cluster_in)); + if (!devices_compatible) { + return true; + } } } @@ -1026,7 +1031,7 @@ StatusOr<bool> MarkForCompilationPassImpl::TryToContractEdge(Cluster* from, } TF_ASSIGN_OR_RETURN(bool will_introduce_cross_device_dependency, - ClusteringWillIntroduceInterDeviceDependency(*to)); + ClusteringWillIntroduceInterDeviceDependency(*from, *to)); if (will_introduce_cross_device_dependency) { return LogNotContractableAndReturnFalse( @@ -1062,8 +1067,16 @@ StatusOr<bool> MarkForCompilationPassImpl::TryToContractEdge(Cluster* from, StatusOr<bool> MarkForCompilationPassImpl::TryToContractEdgesFrom( Cluster* cluster_from) { bool changed = false; - for (int to : - cycles_graph_.Successors(cluster_from->cycles_graph_node_id())) { + + // Make a copy of the set of successors because we may modify the graph in + // TryToContractEdge. + std::vector<int32> successors_copy = [&] { + absl::Span<const int32> successors = + cycles_graph_.Successors(cluster_from->cycles_graph_node_id()); + return std::vector<int32>(successors.begin(), successors.end()); + }(); + + for (int to : successors_copy) { iteration_count_++; if (to >= graph_->num_node_ids()) { // Node is a fictitious node that is present only in the cycle detection @@ -1265,19 +1278,15 @@ StatusOr<bool> MarkForCompilationPassImpl::AreDevicesCompatible( DeviceSet devices = cluster_a.devices(); devices.UnionWith(cluster_b.devices()); - // First check if we will even be able to pick a device for the larger - // combined cluster. TF_ASSIGN_OR_RETURN( - bool can_pick_device, - CanPickDeviceForXla(device_info_cache_, devices, - /*allow_mixing_unknown_and_cpu=*/false)); - if (!can_pick_device) { + absl::optional<jit::DeviceId> maybe_chosen_device, + MaybePickDeviceForXla(device_info_cache_, devices, + /*allow_mixing_unknown_and_cpu=*/false)); + if (!maybe_chosen_device.has_value()) { return false; } - TF_ASSIGN_OR_RETURN(DeviceId chosen_device, - PickDeviceForXla(device_info_cache_, devices, - /*allow_mixing_unknown_and_cpu=*/false)); + jit::DeviceId chosen_device = *maybe_chosen_device; // If we are able to pick a device `chosen_device` for the larger cluster, the // resource operations in `cluster_a` and `cluster_b` must be placed on the @@ -1415,7 +1424,7 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) { op_filter.allow_control_trigger = true; op_filter.allow_eliding_assert_and_checknumerics_ops = true; op_filter.allow_ops_producing_or_consuming_variant = true; - op_filter.allow_svd_op = true; + op_filter.allow_slow_and_inaccurate_ops = true; return RecursiveCompilabilityChecker{&op_filter, &jit_device_type} .IsCompilableCall(ndef, flr); diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index 005b846cd38..8b14dadcf5b 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -1112,6 +1112,45 @@ TEST(XlaCompilationTest, DontClusterMergingNodes) { EXPECT_EQ(clusters["B_dev1"], clusters["MatMul1_dev1"]); } +TEST(XlaCompilationTest, DontClusterMergingNodesOnCPU) { + // This is similar to the 'DontClusterMergingNodes' above, except + // MatMulCombined is placed on the CPU. 
+ Scope root = Scope::NewRootScope().ExitOnError(); + absl::string_view xla_gpu_dev0 = "/job:worker/replica:0/task:0/device:GPU:0"; + absl::string_view xla_gpu_dev1 = "/job:worker/replica:0/task:0/device:GPU:1"; + absl::string_view xla_cpu_dev0 = "/job:worker/replica:0/task:0/device:CPU:0"; + std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global())); + Output a = ops::Tanh(root.WithOpName("tanh_A_dev0"), + ops::Const(root.WithOpName("A_dev0"), 1.0f, {2, 2})); + Output b = ops::Tanh(root.WithOpName("tanh_B_dev1"), + ops::Const(root.WithOpName("B_dev1"), 1.0f, {2, 2})); + Output matmul0 = ops::MatMul(root.WithOpName("MatMul0_dev0"), a, a); + Output matmul1 = ops::MatMul(root.WithOpName("MatMul1_dev1"), b, b); + + Output combined = + ops::MatMul(root.WithOpName("MatMulCombined_cpu"), matmul0, matmul1); + TF_ASSERT_OK(root.ToGraph(graph.get())); + + for (Node* n : graph->nodes()) { + if (absl::EndsWith(n->name(), /*suffix=*/"cpu")) { + n->set_assigned_device_name(string(xla_cpu_dev0)); + } else if (absl::EndsWith(n->name(), /*suffix=*/"dev0")) { + n->set_assigned_device_name(string(xla_gpu_dev0)); + } else if (absl::EndsWith(n->name(), /*suffix=*/"dev1")) { + n->set_assigned_device_name(string(xla_gpu_dev1)); + } + } + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + // Each of the MatMuls should be in a separate cluster. + std::unordered_map<string, string> clusters = GetClusters(*graph); + EXPECT_NE(clusters["MatMul0_dev0"], clusters["MatMul1_dev1"]); + EXPECT_NE(clusters["MatMulCombined_cpu"], clusters["MatMul0_dev0"]); + EXPECT_NE(clusters["MatMulCombined_cpu"], clusters["MatMul1_dev1"]); + EXPECT_EQ(clusters["A_dev0"], clusters["MatMul0_dev0"]); + EXPECT_EQ(clusters["B_dev1"], clusters["MatMul1_dev1"]); +} + // TODO(b/117085735): This form of clustering should be prevented. 
TEST(XlaCompilationTest, NOT_DontClusterSpreadingNodes) { // MatMulSource below creates data for nodes on GPU0 and GPU1 and is placed diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc index eb18ec4be1a..19e3793f29b 100644 --- a/tensorflow/compiler/jit/xla_cpu_device.cc +++ b/tensorflow/compiler/jit/xla_cpu_device.cc @@ -60,7 +60,7 @@ Status XlaCpuDeviceFactory::CreateDevices( registration.cluster_control_trigger = true; registration.elide_assert_and_checknumerics = true; registration.cluster_variant_ops = true; - registration.cluster_svd_op = true; + registration.cluster_slow_and_inaccurate_ops = true; XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_CPU, registration); static XlaDeviceOpRegistrations* registrations = diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index b0ad8221130..913612f9a6c 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -95,7 +95,7 @@ Status XlaGpuDeviceFactory::CreateDevices( registration.cluster_control_trigger = true; registration.elide_assert_and_checknumerics = true; registration.cluster_variant_ops = true; - registration.cluster_svd_op = true; + registration.cluster_slow_and_inaccurate_ops = true; XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_GPU, registration); static XlaDeviceOpRegistrations* registrations = diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc index 9844cd5cc39..4252e2e24ac 100644 --- a/tensorflow/compiler/jit/xla_interpreter_device.cc +++ b/tensorflow/compiler/jit/xla_interpreter_device.cc @@ -63,7 +63,7 @@ Status XlaInterpreterDeviceFactory::CreateDevices( registration.cluster_control_trigger = true; registration.elide_assert_and_checknumerics = true; registration.cluster_variant_ops = true; - registration.cluster_svd_op = true; + registration.cluster_slow_and_inaccurate_ops = true; XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_INTERPRETER, registration); diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 777763342a6..3bb698b33d6 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -168,11 +168,11 @@ Status SnapshotResourceVariables(OpKernelContext* ctx, } XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped) - : xla::DeviceMemoryAllocator(platform), wrapped_(wrapped) {} + : se::DeviceMemoryAllocator(platform), wrapped_(wrapped) {} XlaAllocator::~XlaAllocator() {} -xla::StatusOr<xla::OwningDeviceMemory> XlaAllocator::Allocate( +xla::StatusOr<se::OwningDeviceMemory> XlaAllocator::Allocate( int device_ordinal, uint64 size, bool retry_on_failure) { AllocationAttributes attrs; attrs.no_retry_on_failure = !retry_on_failure; @@ -184,8 +184,8 @@ xla::StatusOr<xla::OwningDeviceMemory> XlaAllocator::Allocate( "Out of memory while trying to allocate ", size, " bytes."); } } - return xla::OwningDeviceMemory(se::DeviceMemoryBase(data, size), - device_ordinal, this); + return se::OwningDeviceMemory(se::DeviceMemoryBase(data, size), + device_ordinal, this); } Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) { @@ -194,7 +194,7 @@ Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) { } XlaComputationLaunchContext::XlaComputationLaunchContext( - xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator, + xla::LocalClient* client, 
se::DeviceMemoryAllocator* xla_allocator, bool allocate_xla_tensors, bool use_multiple_streams) : client_(client), xla_allocator_(xla_allocator), @@ -374,7 +374,7 @@ Status XlaComputationLaunchContext::PopulateOutputs( } else { Tensor output_tensor = XlaTensorBuffer::MakeTensor( ctx->expected_output_dtype(i), shape, buffer, allocator); - output.set_buffer(xla::OwningDeviceMemory(), {output_num}); + output.set_buffer(se::OwningDeviceMemory(), {output_num}); ctx->set_output(i, output_tensor); } ++output_num; @@ -435,7 +435,7 @@ Status XlaComputationLaunchContext::PopulateOutputs( *variable_infos[i].var()->tensor() = output_tensor; } else { se::DeviceMemoryBase buffer = output.buffer({output_num}); - output.set_buffer(xla::OwningDeviceMemory(), {output_num}); + output.set_buffer(se::OwningDeviceMemory(), {output_num}); Tensor output_tensor = XlaTensorBuffer::MakeTensor( write.type, write.shape, buffer, allocator); *variable_infos[i].var()->tensor() = output_tensor; diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index c915b7118d0..c6a9b931401 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -23,14 +23,14 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_tensor.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/core/framework/allocation_description.pb.h" #include "tensorflow/core/framework/resource_var.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/owning_device_memory.h" namespace tensorflow { class XlaAllocator; @@ -108,11 +108,11 @@ Status LockVariables(absl::Span<VariableInfo> variables) // Adapter class that wraps a Tensorflow allocator as an XLA allocator. // Assumes that the Tensorflow allocator permits asynchronous deallocation: // see comment on `AllowsAsynchronousDeallocation()`. -class XlaAllocator : public xla::DeviceMemoryAllocator { +class XlaAllocator : public se::DeviceMemoryAllocator { public: XlaAllocator(const se::Platform* platform, Allocator* wrapped); ~XlaAllocator() override; - xla::StatusOr<xla::OwningDeviceMemory> Allocate( + xla::StatusOr<se::OwningDeviceMemory> Allocate( int device_ordinal, uint64 size, bool retry_on_failure) override; Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override; @@ -142,7 +142,7 @@ class XlaComputationLaunchContext { // because we track inter-stream dependencies through events inside XlaTensor // objects. 
XlaComputationLaunchContext(xla::LocalClient* client, - xla::DeviceMemoryAllocator* xla_allocator, + se::DeviceMemoryAllocator* xla_allocator, bool allocate_xla_tensors, bool use_multiple_streams); @@ -186,7 +186,7 @@ class XlaComputationLaunchContext { private: xla::LocalClient* client_; - xla::DeviceMemoryAllocator* xla_allocator_; + se::DeviceMemoryAllocator* xla_allocator_; bool allocate_xla_tensors_; bool use_multiple_streams_; std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers_; diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc index b92bd675378..1c1080f2385 100644 --- a/tensorflow/compiler/jit/xla_tensor.cc +++ b/tensorflow/compiler/jit/xla_tensor.cc @@ -59,7 +59,7 @@ Status XlaTensor::AllocateShapedBuffer(DataType dtype, xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first); uint64 size = client->backend().transfer_manager()->GetByteSizeRequirement(subshape); - TF_ASSIGN_OR_RETURN(xla::OwningDeviceMemory buffer, + TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer, client->backend().memory_allocator()->Allocate( device_ordinal, size, /*retry_on_failure=*/false)); // Move our buffer into shaped_buffer, which takes ownership of it. diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index d7e987018ff..25756de64ca 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -458,10 +458,6 @@ tf_xla_py_test( name = "extract_image_patches_op_test", size = "small", srcs = ["extract_image_patches_op_test.py"], - tags = [ - "manual", - "notap", - ], deps = [ ":xla_test", "//tensorflow/python:array_ops", diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 807d2effac0..e5126985ffd 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -57,7 +57,7 @@ limitations under the License. #if GOOGLE_CUDA #if GOOGLE_TENSORRT -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorrt/include/NvInfer.h" namespace tensorflow { namespace tensorrt { diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index f568b947959..b0afd32fb6a 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -790,6 +790,14 @@ class TRT_TensorOrWeights::SimpleITensor : public nvinfer1::ITensor { float getDynamicRangeMax() const override { return 0.f; } #endif +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + void setAllowedFormats(nvinfer1::TensorFormats formats) override {} + + nvinfer1::TensorFormats getAllowedFormats() const override { return 1; } + + bool isShape() const override { return false; } +#endif + private: nvinfer1::DataType trt_dtype_; nvinfer1::Dims trt_dims_; @@ -4455,6 +4463,40 @@ Status ConvertDepthSpaceShuffle(OpConverterParams* params) { return Status::OK(); } +Status ConvertSquaredDifference(OpConverterParams* params) { + TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, {"y", false}})); + TF_RETURN_IF_ERROR( + AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + // Broadcast inputs. 
+ nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; + TF_RETURN_IF_ERROR(params->converter->GetTrtBroadcastShape( + inputs.at(0), inputs.at(1), &broadcasted_dims_l, &broadcasted_dims_r)); + nvinfer1::ITensor* tensor_l = nullptr; + nvinfer1::ITensor* tensor_r = nullptr; + TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( + inputs.at(0), broadcasted_dims_l, params->validation_only, &tensor_l)); + TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( + inputs.at(1), broadcasted_dims_r, params->validation_only, &tensor_r)); + if (params->validation_only) return Status::OK(); + + // Subtract x - y. + nvinfer1::IElementWiseLayer* sub = + params->converter->network()->addElementWise( + *tensor_l, *tensor_r, nvinfer1::ElementWiseOperation::kSUB); + TFTRT_RETURN_ERROR_IF_NULLPTR(sub, node_def.name()); + // Multiply (x - y) * (x - y). + nvinfer1::IElementWiseLayer* mul = + params->converter->network()->addElementWise( + *sub->getOutput(0), *sub->getOutput(0), + nvinfer1::ElementWiseOperation::kPROD); + TFTRT_RETURN_ERROR_IF_NULLPTR(mul, node_def.name()); + + params->outputs->push_back(TRT_TensorOrWeights(mul->getOutput(0))); + return Status::OK(); +} + #if IS_TRT_VERSION_GE(5, 1, 0, 0) Status ConvertCombinedNMS(OpConverterParams* params) { TF_RETURN_IF_ERROR( @@ -4641,7 +4683,6 @@ static void RegisterValidatableOpConverters( (*registration)["DepthwiseConv2dNative"] = ConvertConv2DDepthwise; (*registration)["ExpandDims"] = ConvertExpandDims; (*registration)["GatherV2"] = ConvertGather; - (*registration)["Identity"] = ConvertIdentity; // Identity should be removed (*registration)["LeakyRelu"] = ConvertLeakyRelu; (*registration)["MatMul"] = ConvertMatMul; (*registration)["Pack"] = ConvertPack; @@ -4650,11 +4691,11 @@ static void RegisterValidatableOpConverters( (*registration)["Reshape"] = ConvertReshape; (*registration)["Rsqrt"] = ConvertRsqrt; (*registration)["Slice"] = ConvertSlice; - (*registration)["Snapshot"] = ConvertIdentity; // Snapshot should be removed (*registration)["Softmax"] = ConvertSoftmax; (*registration)["SpaceToDepth"] = ConvertDepthSpaceShuffle; (*registration)["Split"] = ConvertSplit; (*registration)["Square"] = ConvertSquare; + (*registration)["SquaredDifference"] = ConvertSquaredDifference; (*registration)["Squeeze"] = ConvertSqueeze; (*registration)["StridedSlice"] = ConvertStridedSlice; (*registration)["TopKV2"] = ConvertTopK; @@ -4688,6 +4729,11 @@ static void RegisterValidatableOpConverters( for (auto arg_minmax_type : {"ArgMin", "ArgMax"}) { (*registration)[arg_minmax_type] = ConvertArgMinMax; } + // The following are no-ops during inference and will not be mapped to any TRT + // layer. + for (auto identity_op_type : {"Identity", "Snapshot", "StopGradient"}) { + (*registration)[identity_op_type] = ConvertIdentity; + } } void TrtNodeValidator::RegisterOpValidators() { diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index ae2e91bcac2..f7322a2cab1 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -50,8 +50,8 @@ limitations under the License. 
#if GOOGLE_CUDA #if GOOGLE_TENSORRT -#include "cuda/include/cuda.h" -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorrt/include/NvInfer.h" namespace tensorflow { @@ -280,6 +280,14 @@ class FakeITensor : public nvinfer1::ITensor { float getDynamicRangeMax() const override { return 0.f; } #endif +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + void setAllowedFormats(nvinfer1::TensorFormats formats) override {} + + nvinfer1::TensorFormats getAllowedFormats() const override { return 1; } + + bool isShape() const override { return false; } +#endif + private: string name_; nvinfer1::Dims dims_; @@ -5353,6 +5361,108 @@ TEST_F(OpConverterTest, ConvertClipByValue) { } #endif // IS_TRT_VERSION_GE(5, 1, 2, 0) +// Get the NodeDef for SquaredDifference. +NodeDef GetSquaredDifferenceNodeDef(DataType dtype) { + Scope s = Scope::NewRootScope(); + auto x = ops::Placeholder(s.WithOpName("x"), dtype); + auto y = ops::Placeholder(s.WithOpName("y"), dtype); + auto squared_diff = + ops::SquaredDifference(s.WithOpName("my_squared_diff"), x, y); + return squared_diff.operation.node()->def(); +} + +template <DataType dtype> +void TestConvertSquaredDifference(OpConverterTest* test) { + typedef typename EnumToDataType<dtype>::Type CType; + + struct TestParams { + std::vector<int> dims_x; + std::vector<int> dims_y; + std::vector<CType> value_x; + std::vector<CType> value_y; + std::vector<int> expected_output_dims; + std::vector<CType> expected_output; + }; + + const std::vector<CType> common_input = InitTestVector<CType>(6); + std::vector<TestParams> params = { + { + /*dims_x=*/{1, 2, 3}, + /*dims_y=*/{1, 2, 3}, + /*value_x=*/common_input, + /*value_y=*/CastTestVector<int, CType>({0, -1, 3, 0, 10, -7}), + /*expected_output_dims=*/{1, 2, 3}, + /*expected_output=*/CastTestVector<int, CType>({0, 4, 1, 9, 36, 144}), + }, + { + /*dims_x=*/{1, 2, 3}, + /*dims_y=*/{1, 1, 3}, + /*value_x=*/common_input, + /*value_y=*/CastTestVector<int, CType>({0, 1, 2}), + /*expected_output_dims=*/{1, 2, 3}, + /*expected_output=*/CastTestVector<int, CType>({0, 0, 0, 9, 9, 9}), + }, + }; + + for (int i = 0; i < params.size(); ++i) { + test->Reset(); + + NodeDef node_def = GetSquaredDifferenceNodeDef(dtype); + test->AddTestTensor("x", params[i].dims_x, 1, TfDataTypeToTrt(dtype)); + test->AddTestTensor("y", params[i].dims_y, 1, TfDataTypeToTrt(dtype)); + test->RunValidationAndConversion(node_def); + + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_squared_diff", &output)); + EXPECT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray(params[i].expected_output_dims, + output.tensor()->getDimensions()); + + DataVec input_data{{"x", test::AsTensor<CType>(params[i].value_x)}, + {"y", test::AsTensor<CType>(params[i].value_y)}}; + DataVec output_data{ + {"my_squared_diff", + ConstructTensor<CType>(params[i].expected_output.size())}}; + test->BuildAndRun( + input_data, &output_data, + dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + EXPECT_THAT(GetSpanForData<CType>(output_data[0]), + ElementsAreArray(params[i].expected_output)); + } +} + +TEST_F(OpConverterTest, ConvertSquaredDifference) { + { + // Input list is empty, should fail. + NodeDef node_def = MakeNodeDef("my_squared_diff", "SquaredDifference", {}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "SquaredDifference got 0 inputs but expected 2, at my_squared_diff"); + } + { + // Input is a weight, should fail. 
+ Reset(); + NodeDef node_def = GetSquaredDifferenceNodeDef(DT_FLOAT); + AddTestWeights<float>("x", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); + AddTestTensor("y", {1, 2, 3}); + RunValidationAndConversion(node_def, error::UNIMPLEMENTED, + "The input \"x\" for SquaredDifference must be " + "a tensor, at my_squared_diff"); + } + { + // Shapes are not broadcastable, should fail. + Reset(); + NodeDef node_def = GetSquaredDifferenceNodeDef(DT_FLOAT); + AddTestTensor("x", {2, 3}); + AddTestTensor("y", {7, 5}); + RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, + "Infeasible broadcast scheme"); + } + + TestConvertSquaredDifference<DT_FLOAT>(this); + TestConvertSquaredDifference<DT_HALF>(this); +} + } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 247d4f0da0a..e84eacdd629 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -41,7 +41,7 @@ limitations under the License. #if GOOGLE_CUDA #if GOOGLE_TENSORRT -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorrt/include/NvInfer.h" namespace tensorflow { @@ -259,7 +259,6 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx, } auto lib = ctx->function_library(); FunctionLibraryRuntime::Options opts; - opts.step_id = ctx->step_id(); opts.rendezvous = ctx->rendezvous(); opts.cancellation_manager = ctx->cancellation_manager(); opts.runner = ctx->runner(); diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index 8d2c26ea23a..b62fdc5dc4b 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -32,7 +32,7 @@ limitations under the License. #if GOOGLE_CUDA #if GOOGLE_TENSORRT -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" namespace tensorflow { namespace tensorrt { diff --git a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc index 769982c6456..7486c6db650 100644 --- a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc +++ b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc @@ -20,8 +20,8 @@ limitations under the License. #if GOOGLE_CUDA #if GOOGLE_TENSORRT -#include "cuda/include/cuda.h" -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorrt/include/NvInfer.h" namespace tensorflow { diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc index a18f758a551..8d2ae49a0d0 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc @@ -19,7 +19,7 @@ limitations under the License. 
#if GOOGLE_CUDA #if GOOGLE_TENSORRT -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #endif // GOOGLE_TENSORRT #endif // GOOGLE_CUDA diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc index 33a5c719ba9..51aa7be07db 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc @@ -22,7 +22,7 @@ limitations under the License. #if GOOGLE_CUDA #if GOOGLE_TENSORRT -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" namespace tensorflow { namespace tensorrt { diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h index d34e244f6c7..70bd3f609a9 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h @@ -25,7 +25,7 @@ limitations under the License. #if GOOGLE_CUDA #if GOOGLE_TENSORRT -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorrt/include/NvInfer.h" namespace tensorflow { diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index c8f72147e90..a2f25724b71 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -27,7 +27,10 @@ package( default_visibility = [":internal"], ) -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") +load( + "//tensorflow/core:platform/default/cuda_build_defs.bzl", + "if_cuda_is_configured", +) load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library", "xla_py_proto_library") cc_library( diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 9fe25dfe3e7..89d5a860179 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -253,6 +253,7 @@ Status FunctionalizeControlFlowPass::Run( {"XlaLaunch", "function"}, }; std::map<string, absl::optional<string>> canonicalized_name_to_new_name; + bool fld_modified = false; for (Node* n : graph->nodes()) { auto it = kNodeTypeToFunctionAttrMapping->find(n->type_string()); if (it == kNodeTypeToFunctionAttrMapping->end()) { @@ -273,9 +274,16 @@ Status FunctionalizeControlFlowPass::Run( n->ClearAttr(func_attr); func.set_name(new_func_name); n->AddAttr(func_attr, func); + + fld_modified = true; } } + if (fld_modified) { + TF_RETURN_IF_ERROR( + PruneUnreachableFunctionsFromGraph(*graph, options.flib_def)); + } + if (VLOG_IS_ON(4)) { DumpGraphToFile("functionalize_control_flow_after", *graph, options.flib_def); diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index d8f4e1af69d..fcc1ea2575b 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -367,7 +367,7 @@ cc_library( copts = tf_copts(), visibility = ["//visibility:public"], deps = [ - "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry", + "//tensorflow/compiler/xla/service:custom_call_target_registry", "//tensorflow/core:framework_lite", "//third_party/eigen3", ], @@ -380,7 +380,7 @@ cc_library( copts = tf_copts(), visibility = ["//visibility:public"], deps = [ - "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry", + 
"//tensorflow/compiler/xla/service:custom_call_target_registry", "//tensorflow/core:framework_lite", "//third_party/eigen3", ], diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc index 29687c7b82f..d801d560040 100644 --- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc @@ -17,7 +17,9 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/util/tensor_format.h" namespace tensorflow { @@ -99,23 +101,22 @@ class ExtractImagePatchesOp : public XlaOpKernel { // The following code is equivalent to: // eye = np.eye(kH * kW * D).reshape([kH, kW, D, kH * kW * kD]) int64 kernel_size = 1; - std::vector<int64> lhs_shape(num_dims, 1); + std::vector<int64> kernel_shape(num_dims, 1); for (int i = 0; i < num_spatial_dims; ++i) { int input_dim = GetTensorSpatialDimIndex(num_dims, data_format, i); - lhs_shape[i] = ksizes_[input_dim]; + kernel_shape[i] = ksizes_[input_dim]; kernel_size *= ksizes_[input_dim]; } - lhs_shape[num_spatial_dims] = depth; - lhs_shape[num_spatial_dims + 1] = 1; - - // Builds an identity matrix as a broadcast equality of iotas. - // iota = np.arange(np.prod(ksize), depth) - // filter = np.equal(np.reshape(iota, [-1, 1]), iota).astype(np.float32) - xla::XlaOp iota = xla::Iota(builder, xla::S32, kernel_size * depth); - - auto lhs = xla::Reshape(iota, lhs_shape); - auto filter = xla::ConvertElementType( - xla::Eq(lhs, iota, {num_spatial_dims + 1}), type); + kernel_shape[num_spatial_dims] = 1; + kernel_shape[num_spatial_dims + 1] = kernel_size * depth; + xla::Shape iota_kernel_shape = + xla::ShapeUtil::MakeShape(xla::S32, {kernel_size, depth, kernel_size}); + xla::XlaOp filter = + xla::Reshape(xla::ConvertElementType( + xla::Eq(xla::Iota(builder, iota_kernel_shape, 0), + xla::Iota(builder, iota_kernel_shape, 2)), + type), + kernel_shape); xla::ConvolutionDimensionNumbers dims; std::vector<int64> window_strides(num_spatial_dims); @@ -148,7 +149,7 @@ class ExtractImagePatchesOp : public XlaOpKernel { xla::XlaOp conv = xla::ConvGeneralDilated(ctx->Input(0), filter, window_strides, padding, - lhs_dilation, rhs_dilation, dims); + lhs_dilation, rhs_dilation, dims, depth); ctx->SetOutput(0, conv); } diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc index 39d96e748b3..19ec222e2e8 100644 --- a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc +++ b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc @@ -16,7 +16,7 @@ limitations under the License. 
#define EIGEN_USE_THREADS #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/platform/dynamic_annotations.h" #include "tensorflow/core/platform/macros.h"
@@ -46,4 +46,4 @@ extern "C" void TF_EXPORT argmax_float_1d_xla_impl(void* out, void** data) { tensorflow::argmax_float_1d_xla_impl(out, data); }
-REGISTER_CUSTOM_CALL_TARGET(argmax_float_1d_xla_impl);
+XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(argmax_float_1d_xla_impl);
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc
index 9b83392d8fb..6e1c1226321 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc
@@ -16,7 +16,7 @@ limitations under the License.
#define EIGEN_USE_THREADS #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/platform/dynamic_annotations.h" #include "tensorflow/core/platform/macros.h"
@@ -51,4 +51,4 @@ extern "C" void TF_EXPORT argmax_float_2d_xla_impl(void* out, void** data) { tensorflow::argmax_float_2d_xla_impl(out, data); }
-REGISTER_CUSTOM_CALL_TARGET(argmax_float_2d_xla_impl);
+XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(argmax_float_2d_xla_impl);
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 8716484a3c1..507bc8d7a3b 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -15,6 +15,7 @@ limitations under the License.
// XLA specific pooling ops.
+#include "tensorflow/compiler/tf2xla/shape_util.h"
#include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -327,6 +328,20 @@ class MaxPoolGradOp : public XlaOpKernel { xla::Padding xla_padding = (padding_ == VALID) ? xla::Padding::kValid : xla::Padding::kSame;
+ // Create a MaxPool operation to check the expected resulting shape, and
+ // then throw away the operation because we don't actually need it here.
+ TensorShape expected_out_shape; + auto pooling = + xla::MaxPool(ctx->Input(0), ksize_, stride_, xla_padding, + XlaTensorFormat(data_format_, tensor_in_shape.dims() - 2)); + auto status_or_shape = pooling.builder()->GetShape(pooling); + OP_REQUIRES_OK(ctx, status_or_shape.status()); + OP_REQUIRES_OK(ctx, XLAShapeToTensorShape(status_or_shape.ValueOrDie(), + &expected_out_shape)); + OP_REQUIRES(ctx, expected_out_shape == out_backprop_shape, + errors::Unimplemented("The output dimensions do not match the " + "other input values.")); + xla::PrimitiveType element_type; OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_type(2), &element_type)); xla::XlaOp init_value = XlaHelpers::Zero(ctx->builder(), input_type(2)); diff --git a/tensorflow/compiler/tf2xla/rearrange_function_argument_pass.cc b/tensorflow/compiler/tf2xla/rearrange_function_argument_pass.cc index 23dd3261f38..75fddf3913d 100644 --- a/tensorflow/compiler/tf2xla/rearrange_function_argument_pass.cc +++ b/tensorflow/compiler/tf2xla/rearrange_function_argument_pass.cc @@ -733,6 +733,7 @@ Status RearrangeFunctionArgumentPass::Run( {"XlaLaunch", "function"}, }; std::map<string, absl::optional<string>> canonicalized_name_to_new_name; + bool fld_modified = false; for (Node* n : graph->nodes()) { auto it = kNodeTypeToFunctionAttrMapping->find(n->type_string()); if (it == kNodeTypeToFunctionAttrMapping->end()) { @@ -753,8 +754,14 @@ Status RearrangeFunctionArgumentPass::Run( n->ClearAttr(func_attr); func.set_name(new_func_name); n->AddAttr(func_attr, func); + + fld_modified = true; } } + if (fld_modified) { + TF_RETURN_IF_ERROR( + PruneUnreachableFunctionsFromGraph(**options.graph, options.flib_def)); + } if (VLOG_IS_ON(4)) { DumpGraphToFile("rearrange_function_argument_after", *graph, diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc index 6cd81f2f262..1aee747476c 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc @@ -773,4 +773,17 @@ Status PropagateConstIntoFunctionalNodes( return Status::OK(); } +Status PruneUnreachableFunctionsFromGraph(const Graph& g, + FunctionLibraryDefinition* fld) { + GraphDef graph_def; + g.ToGraphDef(&graph_def); + FunctionLibraryDefinition reachable_functions = + fld->ReachableDefinitions(graph_def); + for (const string& func_name : fld->ListFunctionNames()) { + if (!reachable_functions.Find(func_name)) { + TF_RETURN_IF_ERROR(fld->RemoveFunction(func_name)); + } + } + return Status::OK(); +} } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.h b/tensorflow/compiler/tf2xla/tf2xla_util.h index cf3aa2f847c..0b78631fd24 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.h +++ b/tensorflow/compiler/tf2xla/tf2xla_util.h @@ -21,6 +21,7 @@ limitations under the License. #include "absl/types/optional.h" #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/kernel_def.pb.h" #include "tensorflow/core/framework/op.h" @@ -197,6 +198,10 @@ Status PropagateConstIntoFunctionalNodes( Graph* g, const FunctionLibraryDefinition* lookup_fld, FunctionLibraryDefinition* fld); +// Prunes unreachable FunctionDefs from FunctionLibraryDefinition. 
+Status PruneUnreachableFunctionsFromGraph(const Graph& g, + FunctionLibraryDefinition* fld); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_UTIL_H_ diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc index 403e579d1c4..c14519c3ade 100644 --- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc +++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc @@ -58,18 +58,13 @@ class XlaCompilationAllocator : public Allocator { // Make sure that even tensors with 0 elements have allocated // buffers, so they get ids to track. - bool ShouldAllocateEmptyTensors() const override { return true; } - - private: - // Don't run any constructors or destructors for complex objects, - // since there is no backing store for the tensor to run them - // on. strings are the only complex objects currently stored in - // Tensors. If others are added, this set of overrides must be - // extended to include them. - void RunStringCtor(string* p, size_t n) override {} - void RunStringDtor(string* p, size_t n) override {} - void RunResourceCtor(ResourceHandle* p, size_t n) override {} - void RunResourceDtor(ResourceHandle* p, size_t n) override {} + // + // NOTE: It is the caller's responsibility to track whether an allocated + // object is a buffer or an opaque handle. In particular, when this allocator + // is used, the caller must not run any constructors or destructors for + // complex objects, since there is no backing store for the tensor in which to + // place their outputs. + bool AllocatesOpaqueHandle() const override { return true; } }; XlaCompilationDevice::XlaCompilationDevice(const SessionOptions& options, diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 406d5ba197b..1cc5d8d4728 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -339,7 +339,7 @@ class XlaCompiler { // here, but on some devices (notably, GPUs), TensorFlow tends to eagerly // allocate most or all available memory on the device, leaving none for the // compiler to access, unless it can use TensorFlow's allocator. - xla::DeviceMemoryAllocator* device_allocator = nullptr; + se::DeviceMemoryAllocator* device_allocator = nullptr; }; explicit XlaCompiler(Options options); diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h index 3e9dc99a690..95d1bf25150 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.h +++ b/tensorflow/compiler/tf2xla/xla_op_registry.h @@ -116,10 +116,9 @@ class XlaOpRegistry { // If we should cluster operations returning DT_VARIANT. bool cluster_variant_ops = false; - // If we should cluster the "Svd" op. The XLA implemenation of this op has - // some performance (b/128001705) and possibly correctness (b/127344411) - // issues so we avoid auto-clustering it for non XLA_* devices. - bool cluster_svd_op = false; + // Whether ops known to be slow or to have correctness issues should be + // auto-clustered. + bool cluster_slow_and_inaccurate_ops = false; }; // Registers an XLA backend. 
`compilation_device_name` is the name of the diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index d5ade8f6262..b800229bd90 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -96,7 +96,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla:xla_proto", - "//tensorflow/compiler/xla/service:device_memory_allocator", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", @@ -117,7 +117,6 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:compiler", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:dump", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:hlo_proto", @@ -126,6 +125,7 @@ cc_library( "//tensorflow/compiler/xla/service:source_map_util", "//tensorflow/compiler/xla/service:stream_pool", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:span", "@llvm//:support", @@ -165,11 +165,11 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:compile_only_service", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:optional", ], diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h index 62d225c6c29..33d1de370de 100644 --- a/tensorflow/compiler/xla/client/client_library.h +++ b/tensorflow/compiler/xla/client/client_library.h @@ -31,7 +31,6 @@ limitations under the License. #include "tensorflow/compiler/xla/client/compile_only_client.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/service/compile_only_service.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" @@ -39,6 +38,7 @@ limitations under the License. #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc index f2d124d099b..d5de53a7941 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.cc +++ b/tensorflow/compiler/xla/client/executable_build_options.cc @@ -22,12 +22,12 @@ limitations under the License. 
namespace xla { ExecutableBuildOptions& ExecutableBuildOptions::set_device_allocator( - DeviceMemoryAllocator* allocator) { + se::DeviceMemoryAllocator* allocator) { device_allocator_ = allocator; return *this; } -DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const { +se::DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const { return device_allocator_; } diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h index 1d85fb34304..e2e231981bf 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.h +++ b/tensorflow/compiler/xla/client/executable_build_options.h @@ -18,11 +18,11 @@ limitations under the License. #include "absl/strings/string_view.h" #include "absl/types/optional.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla.pb.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { @@ -57,11 +57,11 @@ class ExecutableBuildOptions { // want to run various algorithms on the device and pick the fastest one -- it // might allocate buffers for use by these algorithms using this allocator. // - // This does not need to be the same as the DeviceMemoryAllocator passed when - // running the executable. + // This does not need to be the same as the se::DeviceMemoryAllocator passed + // when running the executable. ExecutableBuildOptions& set_device_allocator( - DeviceMemoryAllocator* allocator); - DeviceMemoryAllocator* device_allocator() const; + se::DeviceMemoryAllocator* allocator); + se::DeviceMemoryAllocator* device_allocator() const; // Returns a string representation of the build options, suitable for // debugging. @@ -77,7 +77,7 @@ class ExecutableBuildOptions { Shape result_layout_; bool result_layout_set_ = false; absl::optional<DebugOptions> debug_options_; - DeviceMemoryAllocator* device_allocator_ = nullptr; + se::DeviceMemoryAllocator* device_allocator_ = nullptr; int num_replicas_ = 1; }; diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index ee70adc6e7d..3d15101ea66 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -528,7 +528,9 @@ XlaOp Asin(XlaOp x) { XlaOp Atan(XlaOp x) { return Atan2(x, ScalarLike(x, 1.0)); } -XlaOp Tan(XlaOp x) { return Sin(x) / Cos(x); } +XlaOp Tan(XlaOp x) { + return DoWithUpcastToF32(x, {F16}, [](XlaOp x) { return Sin(x) / Cos(x); }); +} // Hyperbolic trigonometric functions. @@ -574,9 +576,9 @@ XlaOp Acosh(XlaOp x) { // If x^2 will overflow and x is positive, we can approximate x + sqrt(x^2 + 1) // as 2*x and return log(2) + log(x). // -// If x is negative, the above would give us some trouble, because we'd need to -// approximate x + sqrt(sqrt(x^2 + 1) - abs(x). But we're saved -// by the fact that asinh(-x) = -asinh(x). +// If x is negative, the above would give us some trouble; we can't approximate +// the result as x + abs(x) = 0! But we're saved by the fact that asinh(-x) = +// -asinh(x). XlaOp Asinh(XlaOp x) { XlaBuilder* b = x.builder(); auto do_it = [&](XlaOp x) -> StatusOr<XlaOp> { @@ -636,9 +638,39 @@ XlaOp Atanh(XlaOp x) { }); } -XlaOp Cosh(XlaOp x) { return (Exp(x) + Exp(-x)) * ScalarLike(x, 0.5); } +// Cosh(x) = (e^x + e^-x) / 2 +// = e^(x + log(1/2)) + e^(-x + log(1/2)). 
+// +// The second formulation avoids overflowing when e^x = inf but (e^x)/2 is not +// inf. +// +// This incorrectly overflows to inf for two f32 input values, namely +// +/-89.4159851, due to rounding error when computing x +/- log(1/2). The +// correct answer of 3.40281961e+38 (0x7f7fffec) is very close to max-float, so +// we deem this acceptable. +XlaOp Cosh(XlaOp x) { + return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) { + auto log_one_half = Log(ScalarLike(x, 0.5)); + return Exp(x + log_one_half) + Exp(-x + log_one_half); + }); +} -XlaOp Sinh(XlaOp x) { return (Exp(x) - Exp(-x)) * ScalarLike(x, 0.5); } +// Sinh(x) = (e^x - e^-x) / 2 +// = e^(x + log(1/2)) - e^(-x + log(1/2)). +// +// The second formulation avoids overflowing when e^x = inf but (e^x)/2 is not +// inf. +// +// This incorrectly overflows to +/-inf for two f32 input values, namely +// +/-89.4159851, due to rounding error when computing x +/- log(1/2). The +// correct answer of 3.40281961e+38 (0x7f7fffec) is very close to max-float, so +// we deem this acceptable. +XlaOp Sinh(XlaOp x) { + return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) { + auto log_one_half = Log(ScalarLike(x, 0.5)); + return Exp(x + log_one_half) - Exp(-x + log_one_half); + }); +} XlaOp MaybeConjugate(XlaOp x, bool conjugate) { XlaBuilder* builder = x.builder(); diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 192785646ec..1bd9d7b7228 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -279,7 +279,7 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile( StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer( const LiteralSlice& literal, int device_ordinal, - DeviceMemoryAllocator* allocator) { + se::DeviceMemoryAllocator* allocator) { if (allocator == nullptr) { allocator = backend().memory_allocator(); } diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 7f4a3db10b8..1e7c97d6f06 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/compiler.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/local_service.h" @@ -32,6 +31,7 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { @@ -137,7 +137,7 @@ class LocalClient : public Client { // device is used. StatusOr<ScopedShapedBuffer> LiteralToShapedBuffer( const LiteralSlice& literal, int device_ordinal, - DeviceMemoryAllocator* allocator = nullptr); + se::DeviceMemoryAllocator* allocator = nullptr); // Transfer the BorrowingLiteral to the device with the given ordinal. 
StatusOr<TransferToServerResponse> TransferToLocalServer( diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc index 230f3b202a4..39c90b60a09 100644 --- a/tensorflow/compiler/xla/executable_run_options.cc +++ b/tensorflow/compiler/xla/executable_run_options.cc @@ -26,12 +26,13 @@ ExecutableRunOptions& ExecutableRunOptions::set_device_ordinal( int ExecutableRunOptions::device_ordinal() const { return device_ordinal_; } ExecutableRunOptions& ExecutableRunOptions::set_allocator( - DeviceMemoryAllocator* allocator) { + stream_executor::DeviceMemoryAllocator* allocator) { allocator_ = allocator; return *this; } -DeviceMemoryAllocator* ExecutableRunOptions::allocator() const { +stream_executor::DeviceMemoryAllocator* ExecutableRunOptions::allocator() + const { return allocator_; } diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h index 1ac26a0fb40..84629593953 100644 --- a/tensorflow/compiler/xla/executable_run_options.h +++ b/tensorflow/compiler/xla/executable_run_options.h @@ -23,6 +23,7 @@ limitations under the License. namespace stream_executor { class Stream; class Platform; +class DeviceMemoryAllocator; } // namespace stream_executor namespace Eigen { @@ -31,7 +32,6 @@ struct ThreadPoolDevice; namespace xla { -class DeviceMemoryAllocator; class DeviceAssignment; class ExecutionProfile; @@ -39,8 +39,9 @@ class ExecutionProfile; class ExecutableRunOptions { public: // Specifies the allocator to use during execution. - ExecutableRunOptions& set_allocator(DeviceMemoryAllocator* allocator); - DeviceMemoryAllocator* allocator() const; + ExecutableRunOptions& set_allocator( + stream_executor::DeviceMemoryAllocator* allocator); + stream_executor::DeviceMemoryAllocator* allocator() const; // If set, this is the device to run the computation on. Valid device_ordinal // values are: 0 to # of devices - 1. These values are identical to the device @@ -87,7 +88,7 @@ class ExecutableRunOptions { int rng_seed() const; private: - DeviceMemoryAllocator* allocator_ = nullptr; + stream_executor::DeviceMemoryAllocator* allocator_ = nullptr; int device_ordinal_ = -1; const DeviceAssignment* device_assignment_ = nullptr; stream_executor::Stream* stream_ = nullptr; diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml index d756cd74c98..dafc3345555 100644 --- a/tensorflow/compiler/xla/g3doc/_book.yaml +++ b/tensorflow/compiler/xla/g3doc/_book.yaml @@ -29,6 +29,8 @@ upper_tabs: path: /xla/tiled_layout - title: Using AOT compilation path: /xla/tfcompile + - title: Writing custom calls + path: /xla/custom_call - heading: Tutorials - title: XLA compile API path: /xla/tutorials/xla_compile diff --git a/tensorflow/compiler/xla/g3doc/custom_call.md b/tensorflow/compiler/xla/g3doc/custom_call.md new file mode 100644 index 00000000000..acc2c9a92f5 --- /dev/null +++ b/tensorflow/compiler/xla/g3doc/custom_call.md @@ -0,0 +1,329 @@ +# XLA Custom Calls + +This document describes how to write and use XLA "custom calls". Custom calls +let you invoke code written in a programming language like C++ or CUDA from an +XLA program. + +Warning: Custom calls are a low-level power-user feature. It is easy to break +your program in difficult-to-debug (and even difficult-to-notice) ways using +custom-calls. 
You shouldn't use custom calls unless you're prepared to debug XLA
+yourself when something goes wrong, and you should expect relatively little
+assistance from XLA developers if you run into trouble.
+
+Warning: The custom-call API/ABI is not currently stable. We don't intend to
+change it capriciously, but it may change. Some possible future changes are
+described below.
+
+## Custom-call on CPU
+
+You can create an HLO instruction which represents a custom-call via XLA's
+client API. This is not exposed via TensorFlow as of this writing.
+
+For example, the following code uses a custom-call to compute
+`A[i] = B[i % 128] + C[i]` on the CPU. (Of course you could -- and should! -- do
+this with regular HLO.)
+
+```c++
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"
+
+void do_it() {
+  xla::XlaBuilder b("do_it");
+  xla::XlaOp param0 =
+      xla::Parameter(0, xla::ShapeUtil::MakeShape(F32, {128}), "p0");
+  xla::XlaOp param1 =
+      xla::Parameter(1, xla::ShapeUtil::MakeShape(F32, {2048}), "p1");
+  xla::XlaOp custom_call =
+      xla::CustomCall(&b, "do_custom_call", /*operands=*/{param0, param1},
+                      /*output_shape=*/ShapeUtil::MakeShape(F32, {2048}));
+}
+
+void do_custom_call(void* out, const void** in) {
+  float* out_buf = reinterpret_cast<float*>(out);
+  const float* in0 = reinterpret_cast<const float*>(in[0]);
+  const float* in1 = reinterpret_cast<const float*>(in[1]);
+  for (int i = 0; i < 2048; ++i) {
+    out_buf[i] = in0[i % 128] + in1[i];
+  }
+}
+XLA_REGISTER_CUSTOM_CALL_TARGET(do_custom_call, "Host");
+```
+
+Notice that the function `do_custom_call` needs to know the dimensions of the
+buffers it operates over. In this example we hardcode the sizes 128 and 2048. If
+you don't want to do this, you can pass the dimensions in as parameters to the
+call.
+
+## Custom-call on GPU
+
+The GPU custom call framework is somewhat different from that on the CPU. Here
+is a CUDA example that does the same `A[i] = B[i % 128] + C[i]` computation as
+the CPU code above.
+
+```c++
+void do_it() { /* same implementation as above */ }
+
+__global__ void custom_call_kernel(const float* in0, const float* in1,
+                                   float* out) {
+  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  out[idx] = in0[idx % 128] + in1[idx];
+}
+
+void do_custom_call(CUstream stream, void** buffers,
+                    const char* opaque, size_t opaque_len) {
+  const float* in0 = reinterpret_cast<const float*>(buffers[0]);
+  const float* in1 = reinterpret_cast<const float*>(buffers[1]);
+  float* out = reinterpret_cast<float*>(buffers[2]);
+
+  const int64 block_dim = 64;
+  const int64 grid_dim = 2048 / block_dim;
+  custom_call_kernel<<<grid_dim, block_dim,
+                       /*dynamic_shared_mem_bytes=*/0, stream>>>(in0, in1, out);
+}
+XLA_REGISTER_CUSTOM_CALL_TARGET(do_custom_call, "CUDA");
+```
+
+Notice first that the GPU custom call function *is still a function executed on
+the CPU*. Our `do_custom_call` CPU function is responsible for enqueueing work
+on the GPU. Here it launches a CUDA kernel, but it could also do something else,
+like call cublas.
+
+`buffers` is an array of pointers which lives on the host, and each element it
+contains points to device (i.e. GPU) memory. The parameters come first, followed
+by the output value. This is notably different from the CPU calling convention,
+which has two params, `ins` and `out`. The main reason we diverge is to make it
+possible to handle tuple-shaped inputs/outputs efficiently; see the section
+below.
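One practical aside on the CUDA example above: a kernel launch can be rejected (for example, for invalid grid or block dimensions) with no obvious symptom unless you ask the runtime. The sketch below is an editorial illustration, not part of the original example; the name `do_custom_call_checked` is hypothetical, and it simply repeats the launcher above with a minimal launch check added.

```c++
#include <cstdio>

#include <cuda.h>
#include <cuda_runtime.h>

// Same launcher as above, plus a check that the kernel launch was accepted.
// cudaGetLastError() only reports launch-time failures; asynchronous errors
// still surface later on `stream`.
void do_custom_call_checked(CUstream stream, void** buffers,
                            const char* opaque, size_t opaque_len) {
  const float* in0 = reinterpret_cast<const float*>(buffers[0]);
  const float* in1 = reinterpret_cast<const float*>(buffers[1]);
  float* out = reinterpret_cast<float*>(buffers[2]);

  const int block_dim = 64;
  const int grid_dim = 2048 / block_dim;
  custom_call_kernel<<<grid_dim, block_dim, 0, stream>>>(in0, in1, out);

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::fprintf(stderr, "custom_call_kernel launch failed: %s\n",
                 cudaGetErrorString(err));
  }
}
```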
+
+As in the CPU example, we've hardcoded the input and output buffer sizes into
+our custom call. However, unlike in the CPU case, passing the buffer sizes in as
+operands to the custom call would not work well. Usually we need the buffer
+sizes available to us on the CPU; e.g. when launching a kernel, we need to know
+the block/grid dimensions to use. But if we were to pass the buffer sizes as
+operands to our custom call, their values would live in GPU memory. We'd then
+have to do an expensive synchronous device-to-host memcpy at the start of our
+operation just to read the sizes.
+
+To let you work around this, we provide the `opaque` parameter. You can set this
+to an arbitrary string of bytes when you create the custom call:
+
+```c++
+std::string opaque = "...";
+xla::CustomCall(&b, "do_custom_call", /*operands=*/{param0, param1},
+                /*output_shape=*/ShapeUtil::MakeShape(F32, {2048}),
+                opaque);
+```
+
+Since `xla::Shape` has a protocol buffer representation, you could store this
+serialized proto inside of `opaque` and deserialize it within your GPU
+custom-call. Note however that although `xla::ShapeProto` does not change
+frequently, it *does* change. Check the git log to see how it has changed in the
+past.
+
+## Passing tuples to custom-calls
+
+Consider the following custom-call.
+
+```c++
+using xla::ShapeUtil;
+Shape p0_shape = ShapeUtil::MakeTupleShape({
+    ShapeUtil::MakeShape(F32, {32}),
+    ShapeUtil::MakeTupleShape({
+        ShapeUtil::MakeShape(F32, {64}),
+        ShapeUtil::MakeShape(F32, {128}),
+    }),
+    ShapeUtil::MakeShape(F32, {256}),
+});
+xla::XlaOp p0 = xla::Parameter(0, p0_shape, "p0");
+
+Shape out_shape = ShapeUtil::MakeTupleShape({
+    ShapeUtil::MakeShape(F32, {512}),
+    ShapeUtil::MakeShape(F32, {1024}),
+});
+xla::CustomCall(&b, "do_custom_call", /*operands=*/{p0}, out_shape);
+```
+
+On both CPU and GPU, a tuple is represented in memory as an array of pointers.
+In C++ pseudocode, parameter 0 above is laid out as follows.
+
+```c++
+// In-memory layout of parameter 0 from custom-call above. True on both CPU
+// and GPU.
+float* subbuf0 = new float[32];
+float* subbuf1 = new float[64];
+float* subbuf2 = new float[128];
+float* subbuf3 = new float[256];
+
+void** subtuple = new void*[2];
+subtuple[0] = subbuf1;
+subtuple[1] = subbuf2;
+
+void** p0 = new void*[3];
+p0[0] = subbuf0;
+p0[1] = subtuple;
+p0[2] = subbuf3;
+```
+
+Although the in-memory representation of tuples is the same on CPU and GPU, they
+are handled differently in the CPU and GPU custom-call calling conventions.
+
+### Tuple outputs as temp buffers
+
+Tuple inputs to custom-calls are a convenience, but they aren't strictly
+necessary. If we didn't support tuple inputs to custom calls, you could always
+unpack the tuples using get-tuple-element before passing them to the custom
+call.
+
+On the other hand, tuple *outputs* do let you do things you couldn't otherwise.
+
+The obvious reason to have tuple outputs is that this is how a custom call (or
+any other XLA op) returns multiple independent arrays.
+
+But less obviously, a tuple output is also a way to give your custom call temp
+memory. Yes, an *output* can represent a temp buffer. Consider: an output buffer
+has the property that the op can write to it, and it can read from it after it's
+been written to. That's exactly what you want from a temp buffer.
+
+In the example above, suppose we wanted to use the `F32[1024]` element as a temp
+buffer. Then we'd write the HLO just as above, and we'd simply never read tuple
+index 1 of the custom call's output.
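As a concrete sketch of that temp-buffer pattern (an editorial illustration, not part of the original text; it reuses the `b`, `p0`, and `out_shape` names from the example above), the graph simply never consumes tuple element 1:

```c++
// The custom call produces (F32[512], F32[1024]). Only element 0 is read back
// into the computation; element 1 exists solely as scratch memory that the
// custom-call implementation is free to overwrite.
xla::XlaOp call =
    xla::CustomCall(&b, "do_custom_call", /*operands=*/{p0}, out_shape);
xla::XlaOp result = xla::GetTupleElement(call, /*index=*/0);  // F32[512]
// `result` feeds the rest of the graph; the F32[1024] element is never read.
```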
+ +### Tuples in CPU custom-calls + +In CPU code, we have a function `do_custom_call(const void** ins, void* out)`. +`ins` is an array with just one element, which points to `param0`. The +subbuffers of `param0` are accessible by dereferencing that pointer, and the +subbuffers of `output_tuple` are accessible by dereferencing `out`. + +### Tuples in GPU custom-calls + +In GPU code, we have a function `do_custom_call(..., void** buffers, ...)`. In +this case `buffers` is a host array of *nine* device pointers, one for each +nested buffer. To generate the flat list, we iterate over the parameters and +output, and then do preorder traversal of their shapes. Concretely: + +```c++ +// Layout of `buffers` parameter to GPU custom call function for custom-call +// above. +buffers[0] == param0 +buffers[1] == subbuf0 or null +buffers[2] == subtuple or null +buffers[3] == subbuf1 or null +buffers[4] == subbuf2 or null +buffers[5] == subbuf3 or null +buffers[6] == output_tuple +buffers[7] == output_subbuf0 +buffers[8] == output_subbuf1 +``` + +The `or null` part is significant. A sub-buffer of an input tuple will be +non-null in the `buffers` list if XLA is able to statically analyze the program +and figure out the address of the sub-buffer. This is usually the case, but may +not be in programs with control flow and/or `select` ops over tuples. + +A correct custom-call implementation that accepts a tuple as input must always +handle null input sub-buffers, by dereferencing the root tuple. + +The rule is reversed for output buffers. The output sub-buffers will always be +populated, but it's up to the custom call to populate the root tuple at the end. + +See the following code. Note that we leave out CUDA error handling for clarity, +but you'll be thankful if you do it, because otherwise it can be hard to tell +when a stream encounters an error. + +```c++ +void do_custom_call(CUstream stream, void** buffers, const char* opaque, + size_t opaque_len) { + bool needs_sync = false; + const float* subbuf0 = reinterpret_cast<const float*>(buffers[1]); + if (subbuf0 == nullptr) { + needs_sync = true; + cudaMemcpyAsync(&subbuf0, buffers[0], sizeof(void*), + cudaMemcpyDeviceToHost, stream); + } + const void** subtuple = reinterpret_cast<const void**>(buffers[2]); + if (subtuple == nullptr) { + needs_sync = true; + cudaMemcpyAsync(&subtuple, buffers[2], ...); + } + + // ... similarly for other params ... + + // Wait for copies enqueued above to complete. + if (needs_sync) { + cudaStreamSynchronize(stream); + } + needs_sync = false; + + // Now that we have `subtuple`, we can get subbuf1 and subbuf2. + float* subbuf1 = buffers[3]; + if (subbuf1 == nullptr) { + needs_sync = true; + cudaMemcpyAsync(&subbuf1, subtuple, ...); + } + float* subbuf2 = buffers[4]; + if (subbuf2 == nullptr) { + needs_sync = true; + cudaMemcpyAsync(&subbuf2, subtuple + 1, ...); + } + + // Wait for copies enqueued above to complete. + if (needs_sync) { + cudaStreamSynchronize(stream); + } + + // ... actually run the kernel ... + + // Fill the output tuple. + void* outputs[2] = {buffers[7], buffers[8]}; + cudaMemcpyAsync(buffers[6], outputs, sizeof(outputs), cudaMemcpyHostToDevice, + stream); + + // Necessary to force the cudaMemcpyAsync above to complete before `outputs` + // goes out of scope. A sync is only necessary in the tuple output case, and + // see below for a way to avoid this. 
+ cudaStreamSynchronize(stream); +} +``` + +The `cudaStreamSynchronize` at the end of the function is unfortunate, as it's +not required in the non-tuple-output case, and it can be expensive. One way to +get around this would be to make `outputs` into a global variable and ensure +that the previous cudaMemcpyAsync completed before overwriting the global and +enqueueing another one. This is sketched below. + +``` +void do_custom_call(CUstream stream, void** buffers, const char* opaque, + size_t opaque_len) { + + // ... Beginning of function is the same as above ... + + // ... actually run the kernel ... + + static std::atomic<bool> first_time{true}; + static CUevent event; + static void* outputs[2]; + if (first_time.fetch_and(false)) { + // First time running this function. Initialize `event`. + cuEventCreate(&event, CU_EVENT_DISABLE_TIMING); + } else { + // Not first time running this function. Wait for previous event to + // complete before touching `outputs`. + cuEventSynchronize(event); + } + + // Fill the output tuple. + outputs[0] = buffers[7]; + outputs[1] = buffers[8]; + cudaMemcpyAsync(buffers[6], outputs, sizeof(outputs), cudaMemcpyHostToDevice, + stream); + + // Unblock `event` after the memcpy completes. + cuEventRecord(event, stream); +} +``` + +This simple implementation would limit parallelism if you want to run this op on +multiple GPUs concurrently (or on one GPU with multiple streams); in that case +you might need multiple events and globals. We have seen one implementation of +this algorithm which keeps a pool of globals and events and periodically polls +them (perhaps on each call to the op) to garbage collect. diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 339f8f004fa..1738d47f3f1 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -67,8 +67,8 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/core:lib", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/types:optional", "@pybind11", @@ -109,9 +109,9 @@ cc_library( hdrs = ["shared_device_buffer.h"], deps = [ "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:transfer_manager", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/container:flat_hash_set", ], ) @@ -131,11 +131,50 @@ tf_cc_test( ], ) +cc_library( + name = "local_client", + srcs = ["local_client.cc"], + hdrs = ["local_client.h"], + copts = [ + "-fexceptions", + "-fno-strict-aliasing", + "-Wno-c++98-c++11-compat", + ], + features = ["-use_header_modules"], + deps = [ + ":shared_device_buffer", + ":types", + ":worker_thread", + "//tensorflow/compiler/xla:executable_run_options", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/client:executable_build_options", + "//tensorflow/compiler/xla/client:local_client", + 
"//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/service:computation_placer", + "//tensorflow/compiler/xla/service:custom_call_target_registry", + "//tensorflow/compiler/xla/service:platform_util", + "//tensorflow/compiler/xla/service:shaped_buffer", + "//tensorflow/core:lib", + "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + "@com_google_absl//absl/types:span", + "@pybind11", + ], +) + tf_pybind_extension( name = "xla_extension", srcs = [ - "local_client.cc", - "local_client.h", "xla.cc", ], copts = [ @@ -146,22 +185,19 @@ tf_pybind_extension( features = ["-use_header_modules"], module_name = "xla_extension", deps = [ + ":local_client", ":shared_device_buffer", ":types", ":worker_thread", ":xrt", + "@com_google_absl//absl/hash", "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/synchronization", - "@com_google_absl//absl/time", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", "@pybind11", "//third_party/python_runtime:headers", # buildcleaner: keep - "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:literal", - "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", @@ -178,7 +214,7 @@ tf_pybind_extension( "//tensorflow/compiler/xla/client/lib:self_adjoint_eig", "//tensorflow/compiler/xla/client/lib:svd", "//tensorflow/compiler/xla/service:computation_placer", - "//tensorflow/compiler/xla/service:device_memory_allocator", + "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_graph_dumper", "//tensorflow/compiler/xla/service:name_uniquer", @@ -186,9 +222,7 @@ tf_pybind_extension( "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/compiler/xla/service:cpu_plugin", - "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry", "//tensorflow/core:lib", - "//tensorflow/core/profiler/lib:traceme", # Do NOT remove this dependency. The XLA Python extension must not # depend on any part of TensorFlow at runtime, **including** # libtensorflow_framework.so. The XLA module is deployed self-contained diff --git a/tensorflow/compiler/xla/python/local_client.cc b/tensorflow/compiler/xla/python/local_client.cc index fe5142f40a1..66dbb6964eb 100644 --- a/tensorflow/compiler/xla/python/local_client.cc +++ b/tensorflow/compiler/xla/python/local_client.cc @@ -78,7 +78,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/python/shared_device_buffer.h" #include "tensorflow/compiler/xla/python/types.h" -#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h" +#include "tensorflow/compiler/xla/service/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" @@ -101,8 +101,8 @@ Status RegisterCpuCustomCallTarget(const std::string& fn_name, "Argument to RegisterCpuCustomCallTargetRegistry was not a " "xla._CPU_CUSTOM_CALL_TARGET capsule."); } - cpu::CustomCallTargetRegistry::Global()->Register( - std::string(fn_name.begin(), fn_name.end()), static_cast<void*>(capsule)); + CustomCallTargetRegistry::Global()->Register( + fn_name, static_cast<void*>(capsule), "Host"); return Status::OK(); } @@ -147,7 +147,12 @@ Device::Device(se::StreamExecutor* executor, bool use_multiple_streams, "py_xla_execute"); } -Device::~Device() { compute_stream_->parent()->SynchronizeAllActivity(); } +Device::~Device() { + bool ok = compute_stream_->parent()->SynchronizeAllActivity(); + if (!ok) { + LOG(ERROR) << "SynchronizeAllActivity failed when destroying Device."; + } +} void Device::ThenExecuteOnWorkerThread(se::Stream* stream, std::function<void()> callback) const { @@ -155,7 +160,7 @@ void Device::ThenExecuteOnWorkerThread(se::Stream* stream, [this, callback]() { worker_thread_->Schedule(std::move(callback)); }); } -StatusOr<std::unique_ptr<PyLocalClient>> PyLocalClient::Get( +StatusOr<std::shared_ptr<PyLocalClient>> PyLocalClient::Get( const std::string& platform_name, const std::string& xla_platform_name, bool asynchronous) { TF_ASSIGN_OR_RETURN(se::Platform * platform, @@ -168,7 +173,7 @@ StatusOr<std::unique_ptr<PyLocalClient>> PyLocalClient::Get( options.set_platform(platform); TF_ASSIGN_OR_RETURN(LocalClient * client, ClientLibrary::GetOrCreateLocalClient(options)); - return absl::make_unique<PyLocalClient>(platform_name, client, asynchronous); + return std::make_shared<PyLocalClient>(platform_name, client, asynchronous); } PyLocalClient::PyLocalClient(std::string platform_name, LocalClient* client, @@ -210,9 +215,9 @@ StatusOr<pybind11::object> PyLocalClient::TransferFromOutfeed( } static StatusOr<PyLocalBuffer> TransferHostToDeviceAsync( - const PythonBufferTree& tree, int device_ordinal, PyLocalClient* client, - const Device& device) { - DeviceMemoryAllocator* allocator = + const PythonBufferTree& tree, int device_ordinal, + std::shared_ptr<PyLocalClient> client, const Device& device) { + se::DeviceMemoryAllocator* allocator = client->client()->backend().memory_allocator(); TransferManager* transfer_manager = client->client()->backend().transfer_manager(); @@ -255,13 +260,13 @@ static StatusOr<PyLocalBuffer> TransferHostToDeviceAsync( device.ThenReleaseOnWorkerThread(device.host_to_device_stream(), device_buffer); } - return PyLocalBuffer(shape, std::move(device_buffer), client); + return PyLocalBuffer(shape, std::move(device_buffer), std::move(client)); } /* static */ -StatusOr<PyLocalBuffer> PyLocalBuffer::FromPython(const py::object& argument, - PyLocalClient* client, - int device_ordinal) { +StatusOr<PyLocalBuffer> PyLocalBuffer::FromPython( + const py::object& argument, std::shared_ptr<PyLocalClient> client, + int device_ordinal) { tensorflow::profiler::TraceMe traceme("PyLocalBuffer::FromPython"); TF_ASSIGN_OR_RETURN(PythonBufferTree tree, GetPythonBufferTree(argument)); @@ -277,13 +282,13 
@@ StatusOr<PyLocalBuffer> PyLocalBuffer::FromPython(const py::object& argument, << " device ordinal: " << device_ordinal; const Device& device = client->device(device_ordinal); - TF_ASSIGN_OR_RETURN( - PyLocalBuffer buffer, - TransferHostToDeviceAsync(tree, device_ordinal, client, device)); + TF_ASSIGN_OR_RETURN(PyLocalBuffer buffer, + TransferHostToDeviceAsync(tree, device_ordinal, + std::move(client), device)); device.ThenRelease(device.host_to_device_stream(), std::move(py_buffer_ref)); if (!device.asynchronous()) { - device.host_to_device_stream()->BlockHostUntilDone(); + TF_RETURN_IF_ERROR(device.host_to_device_stream()->BlockHostUntilDone()); } return buffer; } @@ -291,7 +296,7 @@ StatusOr<PyLocalBuffer> PyLocalBuffer::FromPython(const py::object& argument, /*static */ StatusOr<std::vector<PyLocalBuffer>> PyLocalBuffer::FromPythonValues( const std::vector<std::pair<py::object, int>>& arguments, - PyLocalClient* client) { + std::shared_ptr<PyLocalClient> client) { tensorflow::profiler::TraceMe traceme("PyLocalBuffer::FromPythonValues"); int num_arguments = static_cast<int>(arguments.size()); std::vector<PyLocalBuffer> outputs(num_arguments); @@ -344,7 +349,7 @@ PyLocalBuffer::FromPythonValues( device.ThenRelease(device.host_to_device_stream(), std::move(transfers[i].py_buffer_ref)); if (!device.asynchronous()) { - device.host_to_device_stream()->BlockHostUntilDone(); + TF_RETURN_IF_ERROR(device.host_to_device_stream()->BlockHostUntilDone()); } } @@ -355,8 +360,8 @@ PyLocalBuffer::FromPythonValues( } /* static */ StatusOr<PyLocalBuffer> PyLocalBuffer::MakeTuple( - const std::vector<PyLocalBuffer> buffers, PyLocalClient* client, - int device_ordinal) { + const std::vector<PyLocalBuffer> buffers, + std::shared_ptr<PyLocalClient> client, int device_ordinal) { std::vector<xla::Shape> host_shapes; std::vector<std::shared_ptr<PySharedDeviceBuffer>> device_buffers; host_shapes.reserve(buffers.size()); @@ -367,7 +372,7 @@ PyLocalBuffer::FromPythonValues( host_shapes.push_back(buffer.on_host_shape()); device_buffers.push_back(buffer.device_buffer()); } - DeviceMemoryAllocator* allocator = + se::DeviceMemoryAllocator* allocator = client->client()->backend().memory_allocator(); TransferManager* transfer_manager = client->client()->backend().transfer_manager(); @@ -382,7 +387,7 @@ PyLocalBuffer::FromPythonValues( device_buffers, transfer_manager, allocator, device_ordinal, definition_event)); PyLocalBuffer buffer(ShapeUtil::MakeTupleShape(host_shapes), tuple_buffer, - client); + std::move(client)); // TODO(phawkins): extend TransferManager so we do not need to form a full // ShapedBuffer just to write the root tuple index table. @@ -393,8 +398,8 @@ PyLocalBuffer::FromPythonValues( // Wait for the compute stream so that memory allocations are synchronized. 
device.host_to_device_stream()->ThenWaitFor(device.compute_stream()); } - transfer_manager->WriteRootTupleIndexTable(device.host_to_device_stream(), - shaped_buffer); + TF_RETURN_IF_ERROR(transfer_manager->WriteRootTupleIndexTable( + device.host_to_device_stream(), shaped_buffer)); if (definition_event) { definition_event->RecordOnStream(device.host_to_device_stream()); } @@ -404,7 +409,7 @@ PyLocalBuffer::FromPythonValues( std::move(tuple_buffer)); } if (!device.asynchronous()) { - device.host_to_device_stream()->BlockHostUntilDone(); + TF_RETURN_IF_ERROR(device.host_to_device_stream()->BlockHostUntilDone()); } return buffer; @@ -412,10 +417,10 @@ PyLocalBuffer::FromPythonValues( PyLocalBuffer::PyLocalBuffer( Shape on_host_shape, std::shared_ptr<PySharedDeviceBuffer> device_buffer, - PyLocalClient* client) - : on_host_shape_(std::move(on_host_shape)), - device_buffer_(std::move(device_buffer)), - client_(client) {} + std::shared_ptr<PyLocalClient> client) + : client_(std::move(client)), + on_host_shape_(std::move(on_host_shape)), + device_buffer_(std::move(device_buffer)) {} StatusOr<py::object> PyLocalBuffer::ToPython() const { tensorflow::profiler::TraceMe traceme("PyLocalBuffer::ToPython"); @@ -462,10 +467,10 @@ StatusOr<std::vector<PyLocalBuffer>> PyLocalBuffer::DestructureTuple() { PyLocalExecutable::PyLocalExecutable( std::shared_ptr<LocalExecutable> executable, - DeviceAssignment device_assignment, PyLocalClient* client) - : executable_(std::move(executable)), - device_assignment_(std::move(device_assignment)), - client_(client) {} + DeviceAssignment device_assignment, std::shared_ptr<PyLocalClient> client) + : client_(std::move(client)), + executable_(std::move(executable)), + device_assignment_(std::move(device_assignment)) {} std::vector<int> PyLocalExecutable::DeviceOrdinals() const { int num_replicas = device_assignment_.replica_count(); @@ -543,7 +548,7 @@ StatusOr<PyLocalBuffer> PyLocalExecutable::ExecuteHelper( device.ThenReleaseOnWorkerThread(device.compute_stream(), executable_); } if (!device.asynchronous()) { - device.compute_stream()->BlockHostUntilDone(); + TF_RETURN_IF_ERROR(device.compute_stream()->BlockHostUntilDone()); } return PyLocalBuffer(on_host_shape, std::move(out_buffer), client_); } @@ -652,7 +657,7 @@ StatusOr<std::vector<PyLocalBuffer>> PyLocalExecutable::ExecutePerReplica( PyLocalExecutable::Compile(const XlaComputation& computation, std::vector<Shape> argument_layouts, const ExecutableBuildOptions* build_options, - PyLocalClient* client) { + std::shared_ptr<PyLocalClient> client) { tensorflow::profiler::TraceMe traceme("LocalExecutable::Compile"); std::vector<const Shape*> argument_layout_pointers; argument_layout_pointers.reserve(argument_layouts.size()); @@ -705,7 +710,7 @@ PyLocalExecutable::Compile(const XlaComputation& computation, return absl::make_unique<PyLocalExecutable>( std::shared_ptr<LocalExecutable>(std::move(local_executable)), - std::move(device_assignment), client); + std::move(device_assignment), std::move(client)); } } // namespace xla diff --git a/tensorflow/compiler/xla/python/local_client.h b/tensorflow/compiler/xla/python/local_client.h index 83cb9bbc372..6a366fe8934 100644 --- a/tensorflow/compiler/xla/python/local_client.h +++ b/tensorflow/compiler/xla/python/local_client.h @@ -169,12 +169,13 @@ class PyLocalClient { public: // Initializes a local XLA client for `platform_name`. Returns an error if no // such platform exists, or if the platform has no visible devices. 
- static StatusOr<std::unique_ptr<PyLocalClient>> Get( - const std::string& platform_name, const std::string& xla_platform_id, + static StatusOr<std::shared_ptr<PyLocalClient>> Get( + const std::string& platform_name, const std::string& xla_platform_name, bool asynchronous); explicit PyLocalClient(std::string platform_name, LocalClient* client, bool asynchronous); + virtual ~PyLocalClient() = default; Status TransferToInfeed(const LiteralSlice& literal, int device_ordinal); StatusOr<pybind11::object> TransferFromOutfeed(const Shape& shape, @@ -192,7 +193,7 @@ class PyLocalClient { PythonRefManager& py_ref_manager() { return py_ref_manager_; } - private: + protected: std::string platform_name_; LocalClient* client_; std::vector<std::unique_ptr<Device>> devices_; @@ -205,29 +206,30 @@ class PyLocalClient { // Holds a reference from Python to one or more device buffers. class PyLocalBuffer { public: - static StatusOr<PyLocalBuffer> FromPython(const pybind11::object& argument, - PyLocalClient* client, - int device_ordinal); + static StatusOr<PyLocalBuffer> FromPython( + const pybind11::object& argument, std::shared_ptr<PyLocalClient> client, + int device_ordinal); // Converts multiple (python object, device ordinal) pairs into // PyLocalBuffers in parallel. static StatusOr<std::vector<PyLocalBuffer>> FromPythonValues( const std::vector<std::pair<pybind11::object, int>>& argument, - PyLocalClient* client); + std::shared_ptr<PyLocalClient> client); static StatusOr<PyLocalBuffer> MakeTuple( - const std::vector<PyLocalBuffer> buffers, PyLocalClient* client, - int device_ordinal); + const std::vector<PyLocalBuffer> buffers, + std::shared_ptr<PyLocalClient> client, int device_ordinal); PyLocalBuffer() = default; PyLocalBuffer(Shape on_host_shape, std::shared_ptr<PySharedDeviceBuffer> device_buffer, - PyLocalClient* client); + std::shared_ptr<PyLocalClient> client); StatusOr<pybind11::object> ToPython() const; const Shape& on_host_shape() const { return on_host_shape_; } const std::shared_ptr<PySharedDeviceBuffer>& device_buffer() const { return device_buffer_; } + int device_ordinal() const { return device_buffer_->device_ordinal(); } void Delete() { device_buffer_ = nullptr; @@ -242,9 +244,9 @@ class PyLocalBuffer { StatusOr<std::vector<PyLocalBuffer>> DestructureTuple(); private: + std::shared_ptr<PyLocalClient> client_ = nullptr; Shape on_host_shape_; std::shared_ptr<PySharedDeviceBuffer> device_buffer_; - PyLocalClient* client_ = nullptr; }; // Represents a compiled computation that can be executed given handles to @@ -254,10 +256,12 @@ class PyLocalExecutable { // Compiles a computation to an executable. 
static StatusOr<std::unique_ptr<PyLocalExecutable>> Compile( const XlaComputation& computation, std::vector<Shape> argument_layouts, - const ExecutableBuildOptions* build_options, PyLocalClient* client); + const ExecutableBuildOptions* build_options, + std::shared_ptr<PyLocalClient> client); PyLocalExecutable(std::shared_ptr<LocalExecutable> executable, - DeviceAssignment device_assignment, PyLocalClient* client); + DeviceAssignment device_assignment, + std::shared_ptr<PyLocalClient> client); int num_replicas() const { return executable_->build_options().num_replicas(); @@ -285,9 +289,9 @@ class PyLocalExecutable { StatusOr<PyLocalBuffer> ExecuteHelper( absl::Span<PyLocalBuffer* const> argument_handles, int replica); + std::shared_ptr<PyLocalClient> const client_; std::shared_ptr<LocalExecutable> executable_; const DeviceAssignment device_assignment_; - PyLocalClient* const client_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/python/shared_device_buffer.cc b/tensorflow/compiler/xla/python/shared_device_buffer.cc index 6ff733c1ed5..f9fbd9eb933 100644 --- a/tensorflow/compiler/xla/python/shared_device_buffer.cc +++ b/tensorflow/compiler/xla/python/shared_device_buffer.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/python/shared_device_buffer.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { @@ -47,14 +47,14 @@ void BufferDefinitionEvent::WaitForEventOnStream(se::Stream* stream) { static std::shared_ptr<PySharedDeviceBuffer> BufferFromScopedShapedBufferIterator( const Shape& on_device_shape, int device_ordinal, - DeviceMemoryAllocator* allocator, + se::DeviceMemoryAllocator* allocator, ShapeTree<se::DeviceMemoryBase>::iterator* iterator, const ShapeTree<se::DeviceMemoryBase>::iterator& end, const std::shared_ptr<BufferDefinitionEvent>& definition_event) { CHECK(*iterator != end); - OwningDeviceMemory device_memory((*iterator)->second, device_ordinal, - allocator); + se::OwningDeviceMemory device_memory((*iterator)->second, device_ordinal, + allocator); (*iterator)->second = se::DeviceMemoryBase(); ++*iterator; @@ -90,7 +90,7 @@ PySharedDeviceBuffer::FromScopedShapedBuffer( /* static */ StatusOr<std::shared_ptr<PySharedDeviceBuffer>> PySharedDeviceBuffer::MakeTuple( std::vector<std::shared_ptr<PySharedDeviceBuffer>> children, - TransferManager* transfer_manager, DeviceMemoryAllocator* allocator, + TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator, int device_ordinal, std::shared_ptr<BufferDefinitionEvent> definition_event) { std::vector<Shape> child_shapes; @@ -102,7 +102,7 @@ PySharedDeviceBuffer::MakeTuple( Shape shape = ShapeUtil::MakeTupleShape(child_shapes); TF_ASSIGN_OR_RETURN( - OwningDeviceMemory device_memory, + se::OwningDeviceMemory device_memory, allocator->Allocate(device_ordinal, transfer_manager->GetByteSizeRequirement(shape))); return std::make_shared<PySharedDeviceBuffer>( @@ -113,10 +113,10 @@ PySharedDeviceBuffer::MakeTuple( /* static */ StatusOr<std::shared_ptr<PySharedDeviceBuffer>> PySharedDeviceBuffer::MakeArray( Shape on_device_shape, TransferManager* transfer_manager, - DeviceMemoryAllocator* allocator, int device_ordinal, + se::DeviceMemoryAllocator* allocator, int device_ordinal, std::shared_ptr<BufferDefinitionEvent> definition_event) { TF_ASSIGN_OR_RETURN( - OwningDeviceMemory device_memory, + se::OwningDeviceMemory device_memory, allocator->Allocate( device_ordinal, 
transfer_manager->GetByteSizeRequirement(on_device_shape))); @@ -153,7 +153,7 @@ ShapedBuffer PySharedDeviceBuffer::AsShapedBuffer( } PySharedDeviceBuffer::PySharedDeviceBuffer( - Shape on_device_shape, OwningDeviceMemory device_memory, + Shape on_device_shape, se::OwningDeviceMemory device_memory, std::vector<std::shared_ptr<PySharedDeviceBuffer>> children, std::shared_ptr<BufferDefinitionEvent> definition_event) : on_device_shape_(std::move(on_device_shape)), diff --git a/tensorflow/compiler/xla/python/shared_device_buffer.h b/tensorflow/compiler/xla/python/shared_device_buffer.h index 705b3a0cfe4..6a57d7fd6a5 100644 --- a/tensorflow/compiler/xla/python/shared_device_buffer.h +++ b/tensorflow/compiler/xla/python/shared_device_buffer.h @@ -17,11 +17,11 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_PYTHON_SHARED_DEVICE_BUFFER_H_ #include "absl/container/flat_hash_set.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/shape.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/owning_device_memory.h" namespace xla { @@ -93,14 +93,14 @@ class PySharedDeviceBuffer { // Makes a tuple buffer. Does not initialize the tuple table. static StatusOr<std::shared_ptr<PySharedDeviceBuffer>> MakeTuple( std::vector<std::shared_ptr<PySharedDeviceBuffer>> children, - TransferManager* transfer_manager, DeviceMemoryAllocator* allocator, + TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator, int device_ordinal, std::shared_ptr<BufferDefinitionEvent> definition_event); // Makes an uninitialized array buffer. static StatusOr<std::shared_ptr<PySharedDeviceBuffer>> MakeArray( Shape on_device_shape, TransferManager* transfer_manager, - DeviceMemoryAllocator* allocator, int device_ordinal, + se::DeviceMemoryAllocator* allocator, int device_ordinal, std::shared_ptr<BufferDefinitionEvent> definition_event); // Builds a ShapedBuffer view onto the buffers of 'tree'. Since @@ -113,7 +113,7 @@ class PySharedDeviceBuffer { const std::vector<std::shared_ptr<PySharedDeviceBuffer>>& children() const { return children_; } - const OwningDeviceMemory& device_memory() const { return device_memory_; } + const se::OwningDeviceMemory& device_memory() const { return device_memory_; } int device_ordinal() const { return device_memory_.device_ordinal(); } const std::shared_ptr<BufferDefinitionEvent> definition_event() const { return definition_event_; @@ -121,7 +121,7 @@ class PySharedDeviceBuffer { PySharedDeviceBuffer() = default; PySharedDeviceBuffer( - Shape on_device_shape, OwningDeviceMemory device_memory, + Shape on_device_shape, se::OwningDeviceMemory device_memory, std::vector<std::shared_ptr<PySharedDeviceBuffer>> children, std::shared_ptr<BufferDefinitionEvent> definition_event); @@ -130,7 +130,7 @@ class PySharedDeviceBuffer { // one-to-one with the tree of device buffers, so to avoid representational // awkwardness we maintain on-host shapes separately. 
Shape on_device_shape_; - OwningDeviceMemory device_memory_; + se::OwningDeviceMemory device_memory_; std::vector<std::shared_ptr<PySharedDeviceBuffer>> children_; // An event that is triggered when the content of one or more buffers is diff --git a/tensorflow/compiler/xla/python/types.cc b/tensorflow/compiler/xla/python/types.cc index 2d0eb8af855..2e76b89faf7 100644 --- a/tensorflow/compiler/xla/python/types.cc +++ b/tensorflow/compiler/xla/python/types.cc @@ -16,15 +16,14 @@ limitations under the License. #include "tensorflow/compiler/xla/python/types.h" #include "absl/container/flat_hash_map.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/stream_executor/owning_device_memory.h" namespace xla { namespace py = pybind11; -xla::StatusOr<PrimitiveType> NumpyTypeToPrimitiveType( - const py::dtype& np_type) { +xla::StatusOr<PrimitiveType> DtypeToPrimitiveType(const py::dtype& np_type) { static auto* types = new absl::flat_hash_map<std::pair<char, int>, PrimitiveType>({ {{'b', 1}, PRED}, @@ -50,6 +49,42 @@ xla::StatusOr<PrimitiveType> NumpyTypeToPrimitiveType( return it->second; } +xla::StatusOr<py::dtype> PrimitiveTypeToDtype(PrimitiveType type) { + switch (type) { + case PRED: + return py::dtype::of<bool>(); + case S8: + return py::dtype::of<int8>(); + case S16: + return py::dtype::of<int16>(); + case S32: + return py::dtype::of<int32>(); + case S64: + return py::dtype::of<int64>(); + case U8: + return py::dtype::of<uint8>(); + case U16: + return py::dtype::of<uint16>(); + case U32: + return py::dtype::of<uint32>(); + case U64: + return py::dtype::of<uint64>(); + case F16: + return py::dtype("e"); + case F32: + return py::dtype::of<float>(); + case F64: + return py::dtype::of<double>(); + case C64: + return py::dtype::of<std::complex<float>>(); + case C128: + return py::dtype::of<std::complex<double>>(); + default: + return Unimplemented("Unimplemented primitive type %s", + PrimitiveType_Name(type)); + } +} + // Returns a numpy-style format descriptor string for `type`. StatusOr<std::string> FormatDescriptorForPrimitiveType(PrimitiveType type) { switch (type) { @@ -159,4 +194,20 @@ StatusOr<PythonBufferTree> GetPythonBufferTree(const py::object& argument) { return tree; } +py::tuple IntSpanToTuple(absl::Span<int64 const> xs) { + py::tuple out(xs.size()); + for (int i = 0; i < xs.size(); ++i) { + out[i] = py::int_(xs[i]); + } + return out; +} + +std::vector<int64> IntSequenceToVector(const py::object& sequence) { + std::vector<int64> output; + for (auto item : sequence) { + output.push_back(item.cast<int64>()); + } + return output; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/python/types.h b/tensorflow/compiler/xla/python/types.h index 02244cf8d5e..c2be8c606f6 100644 --- a/tensorflow/compiler/xla/python/types.h +++ b/tensorflow/compiler/xla/python/types.h @@ -32,29 +32,48 @@ limitations under the License. namespace xla { -// Converts a pybind11-style NumPy dtype to a PrimitiveType. -StatusOr<PrimitiveType> NumpyTypeToPrimitiveType( - const pybind11::dtype& np_type); +// Helper that converts a failing StatusOr to an exception. +// For use only inside pybind11 code. +template <typename T> +T ValueOrThrow(StatusOr<T> v) { + if (!v.ok()) { + throw std::runtime_error(v.status().ToString()); + } + return v.ConsumeValueOrDie(); +} + +// Converts a NumPy dtype to a PrimitiveType. 
+StatusOr<PrimitiveType> DtypeToPrimitiveType(const pybind11::dtype& np_type); + +// Converts a PrimitiveType to a Numpy dtype. +StatusOr<pybind11::dtype> PrimitiveTypeToDtype(PrimitiveType type); // Converts a literal to (possibly-nested tuples of) NumPy arrays. // The literal's leaf arrays are not copied; instead the NumPy arrays share // buffers with the literals. Takes ownership of `literal` and keeps the // necessary pieces alive using Python reference counting. // Requires the GIL. -StatusOr<pybind11::object> LiteralToPython( - std::unique_ptr<xla::Literal> literal); +StatusOr<pybind11::object> LiteralToPython(std::unique_ptr<Literal> literal); // Converts a Python object into an XLA shape and a vector of leaf buffers. // The leaf buffers correspond to a depth-first, left-to-right traversal of // the Python value. // Requires the GIL. struct PythonBufferTree { - absl::InlinedVector<xla::BorrowingLiteral, 1> leaves; - xla::Shape shape; + absl::InlinedVector<BorrowingLiteral, 1> leaves; + Shape shape; }; StatusOr<PythonBufferTree> GetPythonBufferTree( const pybind11::object& argument); +// Converts a sequence of int64s to a Python tuple of ints. +// Pybind11 by default converts a std::vector<int64> to a Python list; for +// shapes we frequently want a tuple instead. +pybind11::tuple IntSpanToTuple(absl::Span<int64 const> xs); + +// Converts a Python sequence of integers to a std::vector<int64> +std::vector<int64> IntSequenceToVector(const pybind11::object& sequence); + } // namespace xla // This namespace is a documented pybind11 extension point. @@ -161,7 +180,7 @@ struct type_caster<xla::BorrowingLiteral> { for (int i = 0; i < array.ndim(); ++i) { dims[i] = array.shape(i); } - auto type = xla::NumpyTypeToPrimitiveType(array.dtype()); + auto type = xla::DtypeToPrimitiveType(array.dtype()); if (!type.ok()) { throw std::runtime_error(type.status().ToString()); } diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 0c4d5a0bc81..b562cf94736 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -16,10 +16,11 @@ limitations under the License. #include <string> #include <vector> -#include "absl/strings/string_view.h" +#include "absl/hash/hash.h" #include "absl/synchronization/mutex.h" #include "absl/types/optional.h" #include "absl/types/span.h" +#include "include/pybind11/numpy.h" #include "include/pybind11/pybind11.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/lib/comparators.h" @@ -33,7 +34,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/python/local_client.h" #include "tensorflow/compiler/xla/python/types.h" #include "tensorflow/compiler/xla/python/xrt.h" -#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -129,34 +129,95 @@ PYBIND11_MODULE(xla_extension, m) { .value("TOKEN", TOKEN); // Shapes - py::class_<Shape>(m, "Shape") + py::class_<Shape> shape_class(m, "Shape"); + shape_class .def_static( - "Tuple", + "tuple_shape", [](std::vector<Shape> shapes) -> Shape { return ShapeUtil::MakeTupleShape(shapes); }, - "Makes a tuple shape.") + "Constructs a tuple shape.") .def_static( - "Array", - [](PrimitiveType type, std::vector<int64> dims, - absl::optional<std::vector<int64>> layout) -> Shape { - if (layout) { - return ShapeUtil::MakeShapeWithLayout(type, dims, *layout); + "array_shape", + [](PrimitiveType type, py::object dims_seq, + absl::optional<py::object> layout_seq) -> Shape { + std::vector<int64> dims = IntSequenceToVector(dims_seq); + if (layout_seq) { + std::vector<int64> layout = IntSequenceToVector(*layout_seq); + return ShapeUtil::MakeShapeWithLayout(type, dims, layout); } else { Shape shape = ShapeUtil::MakeShape(type, dims); shape.clear_layout(); return shape; } }, - "Makes an array shape.", py::arg("type"), py::arg("dims"), + "Constructs an array shape.", py::arg("type"), py::arg("dims"), + py::arg("layout") = absl::nullopt) + .def_static( + "array_shape", + [](py::dtype dtype, py::object dims_seq, + absl::optional<py::object> layout_seq) -> Shape { + PrimitiveType type = ValueOrThrow(DtypeToPrimitiveType(dtype)); + std::vector<int64> dims = IntSequenceToVector(dims_seq); + if (layout_seq) { + std::vector<int64> layout = IntSequenceToVector(*layout_seq); + return ShapeUtil::MakeShapeWithLayout(type, dims, layout); + } else { + Shape shape = ShapeUtil::MakeShape(type, dims); + shape.clear_layout(); + return shape; + } + }, + "Constructs an array shape.", py::arg("type"), py::arg("dims"), py::arg("layout") = absl::nullopt) .def("dimensions", - static_cast<const std::vector<int64>& (Shape::*)() const>( - &Shape::dimensions)) - .def("element_type", &Shape::element_type) + [](const Shape& shape) -> py::tuple { + return IntSpanToTuple(shape.dimensions()); + }) + .def("xla_element_type", &Shape::element_type) + .def("element_type", + [](const Shape& shape) { + return ValueOrThrow(PrimitiveTypeToDtype(shape.element_type())); + }) + .def("numpy_dtype", + [](const Shape& shape) { + if (shape.IsTuple()) { + return py::dtype("O"); + } + return ValueOrThrow(PrimitiveTypeToDtype(shape.element_type())); + }) + .def("is_tuple", &Shape::IsTuple) + .def("is_array", &Shape::IsArray) + .def("rank", &Shape::rank) + .def("to_serialized_proto", + [](const Shape& shape) { + ShapeProto proto = shape.ToProto(); + return py::bytes(proto.SerializeAsString()); + }) .def("tuple_shapes", - static_cast<const std::vector<Shape>& (Shape::*)() const>( - &Shape::tuple_shapes)) + [](const Shape& shape) { + return std::vector<Shape>(shape.tuple_shapes()); + }) + .def( + "with_major_to_minor_layout_if_absent", + [](const Shape& shape) { + Shape out = shape; + ShapeUtil::ForEachMutableSubshape( + &out, [](Shape* subshape, const ShapeIndex&) { + if (!subshape->has_layout()) { + LayoutUtil::SetToDefaultLayout(subshape); + } + }); + return out; + }, + "Returns a copy of a shape with missing layouts set to " + 
"major-to-minor.") + .def("__eq__", [](const Shape& shape, + const Shape& other) { return shape == other; }) + .def("__ne__", [](const Shape& shape, + const Shape& other) { return shape != other; }) + .def("__hash__", + [](const Shape& shape) { return absl::Hash<Shape>()(shape); }) .def("__repr__", [](const Shape& shape) { return shape.ToString(/*print_layouts=*/true); }); @@ -171,10 +232,10 @@ PYBIND11_MODULE(xla_extension, m) { *program_shape.mutable_result() = result; return program_shape; })) - .def("Parameters", + .def("parameter_shapes", static_cast<const std::vector<Shape>& (ProgramShape::*)() const>( &ProgramShape::parameters)) - .def("Result", &ProgramShape::result) + .def("result_shape", &ProgramShape::result) .def("__repr__", &ProgramShape::ToString); // Literals @@ -211,22 +272,25 @@ PYBIND11_MODULE(xla_extension, m) { // CPU custom-call targets. m.def("RegisterCpuCustomCallTarget", &RegisterCpuCustomCallTarget); - // The LocalClient object allows dynamic attributes to allow external backends - // (e.g., TPU) to stash private data in the client. - py::class_<PyLocalClient>(m, "LocalClient", py::dynamic_attr()) - .def_static("Get", &PyLocalClient::Get) + py::class_<PyLocalClient, std::shared_ptr<PyLocalClient>>(m, "LocalClient") + .def_static("Get", &PyLocalClient::Get, py::arg("platform"), + py::arg("xla_platform_id"), py::arg("asynchronous")) .def("DeviceCount", &PyLocalClient::device_count) .def("TransferToInfeed", &PyLocalClient::TransferToInfeed) .def("TransferFromOutfeed", &PyLocalClient::TransferFromOutfeed); py::class_<PyLocalBuffer>(m, "PyLocalBuffer") - .def_static("FromPython", &PyLocalBuffer::FromPython) - .def_static("FromPythonValues", &PyLocalBuffer::FromPythonValues) - .def_static("MakeTuple", &PyLocalBuffer::MakeTuple) - .def("Delete", &PyLocalBuffer::Delete) - .def("DestructureTuple", &PyLocalBuffer::DestructureTuple) - .def("ToPython", &PyLocalBuffer::ToPython) - .def("shape", &PyLocalBuffer::on_host_shape); + .def_static("from_python", &PyLocalBuffer::FromPython) + .def_static("from_python_values", &PyLocalBuffer::FromPythonValues) + .def_static("make_tuple", &PyLocalBuffer::MakeTuple) + .def("delete", &PyLocalBuffer::Delete) + .def("destructure", &PyLocalBuffer::DestructureTuple) + .def("to_py", &PyLocalBuffer::ToPython) + .def("shape", &PyLocalBuffer::on_host_shape) + .def("device", &PyLocalBuffer::device_ordinal) + .def("is_deleted", [](const PyLocalBuffer& buffer) { + return buffer.device_buffer() == nullptr; + }); py::class_<PyLocalExecutable>(m, "LocalExecutable") .def_static("Compile", &PyLocalExecutable::Compile, @@ -301,7 +365,12 @@ PYBIND11_MODULE(xla_extension, m) { // XlaBuilder. py::module ops = m.def_submodule("ops", "XLA operations"); + ops.def("AllReduce", + static_cast<XlaOp (*)( + XlaOp, const XlaComputation&, absl::Span<const ReplicaGroup>, + const absl::optional<ChannelHandle>&)>(&CrossReplicaSum)); ops.def("AllToAll", &AllToAll); + ops.def("CollectivePermute", &CollectivePermute); ops.def("CrossReplicaSum", static_cast<XlaOp (*)(XlaOp, absl::Span<const ReplicaGroup>)>( &CrossReplicaSum)); diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 73235b528f3..f25f09dc412 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -28,7 +28,6 @@ import os import numpy as np import six -from six.moves import xrange # Note this module does *not* depend on any Python protocol buffers. 
The XLA # Python bindings are currently packaged both as part of jaxlib and as part @@ -71,18 +70,10 @@ class Backend(object): for pyval, device in pyvals_and_devices ] - @abc.abstractmethod - def delete_buffer(self, c_buffer): - """Deletes buffer `c_buffer`.""" - @abc.abstractmethod def make_tuple(self, c_buffers, device_ordinal): """Makes a tuple from a sequence of backend buffer objects.""" - @abc.abstractmethod - def destructure_tuple(self, c_buffer): - """Destructures a tuple buffer into a sequence of buffers.""" - @abc.abstractmethod def compile(self, computation, compile_options): """Compiles a computation. Returns an executable.""" @@ -103,47 +94,38 @@ class Backend(object): class LocalBackend(Backend): """XLA backend implemented using the in-process xla::LocalClient API.""" - def __init__(self, platform=None, xla_platform_id=None, asynchronous=False): + def __init__(self, platform, client): """Creates a new LocalBackend. Args: platform: A string; the user-visible platform name, e.g. 'gpu'. - xla_platform_id: A string; XLA's name for the platform, e.g., 'CUDA'. - asynchronous: A boolean; should we enable asynchronous execution? - (Experimental.) + client: An _xla.PyLocalClient object. """ super(LocalBackend, self).__init__(platform) - self.client = _xla.LocalClient.Get(platform, xla_platform_id, asynchronous) + self.client = client def device_count(self): return self.client.DeviceCount() def buffer_from_pyval(self, pyval, device=0): - return _xla.PyLocalBuffer.FromPython(pyval, self.client, device) + return _xla.PyLocalBuffer.from_python(pyval, self.client, device) def buffers_from_pyvals(self, pyvals_and_devices): - return _xla.PyLocalBuffer.FromPythonValues(pyvals_and_devices, self.client) - - def delete_buffer(self, c_buffer): - c_buffer.Delete() + return _xla.PyLocalBuffer.from_python_values(pyvals_and_devices, + self.client) def make_tuple(self, c_buffers, device_ordinal): - return _xla.PyLocalBuffer.MakeTuple(c_buffers, self.client, device_ordinal) - - def destructure_tuple(self, c_buffer): - return c_buffer.DestructureTuple() + return _xla.PyLocalBuffer.make_tuple(c_buffers, self.client, device_ordinal) def compile(self, c_computation, compile_options): options = _xla.ExecutableBuildOptions() options.num_replicas = compile_options.num_replicas if compile_options.argument_layouts: - argument_layouts = [ - s.as_xla_shape() for s in compile_options.argument_layouts - ] + argument_layouts = compile_options.argument_layouts else: - argument_layouts = c_computation.GetProgramShape().Parameters() + argument_layouts = c_computation.GetProgramShape().parameter_shapes() if compile_options.result_layout: - options.result_layout = compile_options.result_layout.as_xla_shape() + options.result_layout = compile_options.result_layout options.debug_options.xla_cpu_fast_math_honor_infs = True options.debug_options.xla_cpu_fast_math_honor_nans = True return _xla.LocalExecutable.Compile(c_computation, argument_layouts, @@ -159,10 +141,22 @@ class LocalBackend(Backend): return executable.ExecutePerReplica(per_replica_args) +def _cpu_backend_factory(): + client = _xla.LocalClient.Get( + platform='cpu', xla_platform_id='Host', asynchronous=True) + return LocalBackend(platform='cpu', client=client) + + +def _gpu_backend_factory(): + client = _xla.LocalClient.Get( + platform='gpu', xla_platform_id='CUDA', asynchronous=False) + return LocalBackend(platform='gpu', client=client) + + # Backend factories, keyed by user-visible name, in increasing priority order. 
_local_backend_factories = collections.OrderedDict([ - ('cpu', lambda: LocalBackend(platform='cpu', xla_platform_id='Host')), - ('gpu', lambda: LocalBackend(platform='gpu', xla_platform_id='CUDA')), + ('cpu', _cpu_backend_factory), + ('gpu', _gpu_backend_factory), ]) @@ -291,95 +285,12 @@ def dtype_to_etype(dtype): return DTYPE_TO_XLA_ELEMENT_TYPE[str(np.dtype(dtype))] -class Buffer(object): - """Represents a handle to data owned by XLA. - - The referent is ready for use in executing a local, compiled - Computation. On XLA platforms involving a device (e.g. GPU), this - means the referent is in device memory. - """ - - def __init__(self, c_buffer, backend, device): - self.c_buffer = c_buffer - self._backend = backend - self._device = device - - @staticmethod - def from_pyval(pyval, device=0, backend=None): - """Copies the `pyval` to a freshly allocated on-device buffer.""" - backend = backend or get_local_backend() - pyval = require_numpy_array_layout(pyval) - cbuf = backend.buffer_from_pyval(pyval, device) - return Buffer(cbuf, backend, device) - - @staticmethod - def from_pyvals(pyvals_and_devices, backend=None): - """Copies multiple Python values to freshly allocated on-device buffers. - - Arguments: - pyvals_and_devices: a list of `(pyval, device)` pairs, where `pyval` is - a Python value to copy (e.g., a NumPy array), and `device` is an integer - device ordinal. - backend: a Backend object, or `None` to use the default local backend. - Returns: - A list of `Buffer` objects corresponding to `pyvals_and_devices`. - """ - backend = backend or get_local_backend() - pyvals_and_devices = [(require_numpy_array_layout(pyval), device) - for pyval, device in pyvals_and_devices] - cbufs = backend.buffers_from_pyvals(pyvals_and_devices) - return [ - Buffer(cbuf, backend, device) - for cbuf, (_, device) in zip(cbufs, pyvals_and_devices) - ] - - @staticmethod - def make_tuple(buffers, backend=None, device=0): - backend = backend or get_local_backend() - buf = backend.make_tuple([b.c_buffer for b in buffers], - device_ordinal=device) - return Buffer(buf, backend, device) - - def to_py(self): - return self.c_buffer.ToPython() - - def shape(self): - return _wrap_shape(self.c_buffer.shape()) - - def device(self): - return self._device - - def delete(self): - if self.c_buffer is not None: - self._backend.delete_buffer(self.c_buffer) - self.c_buffer = None - - def destructure(self): - """Assuming a tuple buffer, unpack it into constituent tuple elements.""" - assert self.c_buffer is not None - result = self._backend.destructure_tuple(self.c_buffer) - return tuple( - Buffer(sub_buffer, device=self._device, backend=self._backend) - for sub_buffer in result) - - def is_deleted(self): - return self.c_buffer is None - - -# TODO(phawkins): Alias for backward compatibility. Remove after JAX drops -# compatibility with Jaxlib versions older than 0.1.13. -LocalBuffer = Buffer - - -class Format(enum.IntEnum): - """Python copy of the Format protocol buffer enum.""" - INVALID_FORMAT = 0 - DENSE = 1 - SPARSE = 2 - +Shape = _xla.Shape +Shape.__doc__ = """ +A Shape is an object defined in C++ that duck types like the following class: class Shape(object): - """Represents an XLA shape. + '''Represents an XLA shape. 
A shape is either an array shape, having rank-many integer dimensions and an element type (represented by a Numpy dtype), or it @@ -388,188 +299,120 @@ class Shape(object): type shape = TupleShape of shape list | ArrayShape of { dimensions: int list; element_type: dtype } + ''' - Callers are expected to instantiate this class only via the static - constructors: tuple_shape, array_shape, and from_pyval. + @staticmethod + def tuple_shape(tuple_shapes) -> Shape: + "Construct a tuple shape." + + @staticmethod + def array_shape(element_type, dimensions, minor_to_major=None) -> Shape: + + @staticmethod + def from_pyval(pyval) -> Shape: + "Returns a Shape that describes a tuple-tree of Numpy arrays." + + def __eq__(self, other: Shape) -> bool: + def __ne__(self, other: Shape) -> bool: + def __hash__(self): + def __repr__(self): + def is_tuple(self) -> bool: + def is_array(self) -> bool: + def tuple_shapes(self) -> [Shape]: + def numpy_dtype(self) -> np.dtype: + "Like element_type(), but returns dtype('O') for a tuple shape." + def xla_element_type(self) -> PrimitiveType: + def element_type(self) -> np.dtype: + def dimensions(self) -> (int, int, ...): + def rank(self) -> int: + def minor_to_major(self) -> [int]: + def with_major_to_minor_layout_if_absent(self) -> Shape: + "Returns a copy with missing layouts set to major-to-minor." + + def to_serialized_proto(self) -> bytes: + "Returns 'shape' as a serialized proto." +""" + +ProgramShape = _xla.ProgramShape +ProgramShape.__doc__ = """ +A ProgramShape is a C++ object that duck types like the following class. + +class ProgramShape(object): + def __init__(self, parameter_shapes, result_shape): + def parameter_shapes(self) -> [Shape]: + def result_shape(self) -> Shape: + def __repr__(self): +""" + + +class Buffer(object): + """Represents a handle to data owned by XLA. + + The referent is ready for use in executing a local, compiled + Computation. On XLA platforms involving a device (e.g. GPU), this + means the referent is in device memory. """ @staticmethod - def tuple_shape(tuple_shapes): - """Construct a tuple shape.""" - if (not isinstance(tuple_shapes, (tuple, list)) or - not all(isinstance(t, Shape) for t in tuple_shapes)): - raise TypeError('tuple_shapes must be a tuple of Shapes') - return Shape(tuple_shapes, tuple) + def from_pyval(pyval, device=0, backend=None): + """Copies the `pyval` to a freshly allocated on-device buffer.""" + backend = backend or get_local_backend() + pyval = require_numpy_array_layout(pyval) + return backend.buffer_from_pyval(pyval, device) @staticmethod - def array_shape(element_type, dimensions, minor_to_major=None): - """Construct an array shape.""" - if (not isinstance(dimensions, tuple) or - not all(isinstance(i, int) for i in dimensions)): - dimensions = tuple(int(i) for i in dimensions) - return Shape( - dimensions, np.dtype(element_type), minor_to_major=minor_to_major) + def from_pyvals(pyvals_and_devices, backend=None): + """Copies multiple Python values to freshly allocated on-device buffers. 
- @staticmethod - def from_pyval(pyval): - """Returns a Shape that describes a tuple-tree of Numpy arrays.""" - - def convert(pyval): - if isinstance(pyval, tuple): - return Shape.tuple_shape(tuple(convert(elt) for elt in pyval)) - else: - pyval = require_numpy_array_layout(pyval) - return Shape.array_shape(pyval.dtype, np.shape(pyval)) - - return convert(pyval) - - def __init__(self, dimensions, dtype, minor_to_major=None): - assert isinstance(dimensions, tuple) - self._dimensions = dimensions - self._dtype = dtype - self._is_tuple = dtype == tuple - self._minor_to_major = minor_to_major - self._check_minor_to_major() - - def __eq__(self, other): - # pylint: disable=protected-access - return (self._dtype == other._dtype and - self._dimensions == other._dimensions and - self._minor_to_major == other._minor_to_major) - - def __ne__(self, other): - return not self == other - - def __hash__(self): - return hash((self._dtype, self._dimensions, self._minor_to_major)) - - def __repr__(self): - return ('xla_client.Shape(_dtype={!r}, _dimensions={!r}, ' - '_is_tuple={!r}, _minor_to_major={!r})').format( - self._dtype, self._dimensions, self._is_tuple, - self._minor_to_major) - - def is_tuple(self): - return self._is_tuple - - def is_array(self): - return not self._is_tuple - - def tuple_shapes(self): - if not self.is_tuple(): - raise ValueError('not a tuple shape') - return self._dimensions - - def numpy_dtype(self): - """Like element_type(), but returns dtype('O') in case of a tuple shape.""" - if self.is_tuple(): - return np.dtype(np.object) - else: - return self.element_type() - - def xla_element_type(self): - return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.numpy_dtype())] - - def element_type(self): - if not self.is_array(): - raise ValueError('not an array shape') - return self._dtype - - def dimensions(self): - if not self.is_array(): - raise ValueError('not an array shape') - return self._dimensions - - def rank(self): - return len(self.dimensions()) - - def minor_to_major(self): - return self._minor_to_major - - def map_leaves(self, f): - """Map f over each leaf-level array subshape. - - Args: - f: The function to apply. Whenever f returns None, the identity is applied - instead. + Arguments: + pyvals_and_devices: a list of `(pyval, device)` pairs, where `pyval` is a + Python value to copy (e.g., a NumPy array), and `device` is an integer + device ordinal. + backend: a Backend object, or `None` to use the default local backend. Returns: - A new Shape with the mapped leaves. + A list of `Buffer` objects corresponding to `pyvals_and_devices`. """ - if self.is_tuple(): - children = tuple(child.map_leaves(f) for child in self.tuple_shapes()) - return Shape.tuple_shape(children) + backend = backend or get_local_backend() + pyvals_and_devices = [(require_numpy_array_layout(pyval), device) + for pyval, device in pyvals_and_devices] + return backend.buffers_from_pyvals(pyvals_and_devices) + + @staticmethod + def make_tuple(buffers, backend=None, device=0): + backend = backend or get_local_backend() + return backend.make_tuple(buffers, device_ordinal=device) + + # Buffer is not an instantiable type and exists only for its static methods. 
+ # The underlying buffer objects are C++ object with the following + # API: + # def to_py(self): + # def shape(self) -> Shape: + # def device(self) -> int: + # def delete(self): + # def destructure(self) -> [Buffer] + # def is_deleted(self) -> bool: + # + # TODO(phawkins): remove Buffer and its static methods completely, have + # clients call methods on Backend to create buffers. + + +# TODO(phawkins): Alias for backward compatibility. Remove after JAX drops +# compatibility with Jaxlib versions older than 0.1.13. +LocalBuffer = Buffer + + +def shape_from_pyval(pyval): + """Returns a Shape that describes a tuple-tree of Numpy arrays.""" + + def convert(pyval): + if isinstance(pyval, tuple): + return Shape.tuple_shape(tuple(convert(elt) for elt in pyval)) else: - mapped = f(self) - return self if mapped is None else mapped + pyval = require_numpy_array_layout(pyval) + return Shape.array_shape(pyval.dtype, np.shape(pyval)) - def _check_minor_to_major(self): - mtm = self._minor_to_major - if self.is_tuple(): - assert mtm is None, self - if mtm is not None: - assert self.rank() == len(mtm), self - assert sorted(mtm) == list(range(len(mtm))), self - - def update_minor_to_major(self, minor_to_major): - if not self.is_array(): - raise ValueError('not an array shape') - if not isinstance(minor_to_major, tuple): - raise TypeError('minor_to_major must be a tuple') - updated = Shape.array_shape(self.element_type(), self.dimensions(), - minor_to_major) - updated._check_minor_to_major() # pylint: disable=protected-access - return updated - - def with_major_to_minor_layout_if_absent(self): - """Returns a copy of a shape with missing layouts set to major-to-minor.""" - - def f(a): - if a.minor_to_major(): - return None - return a.update_minor_to_major(tuple(xrange(a.rank() - 1, -1, -1))) - - return self.map_leaves(f) - - def serialize(self, proto): - """Serializes 'shape' into proto.""" - if self.is_tuple(): - proto.element_type = int(PrimitiveType.TUPLE) - for shape in self.tuple_shapes(): - shape.serialize(proto.tuple_shapes.add()) - else: - proto.element_type = int(self.xla_element_type()) - proto.dimensions.extend(self.dimensions()) - proto.is_dynamic_dimension.extend([False for _ in self.dimensions()]) - if self.minor_to_major(): - proto.layout.format = Format.DENSE - proto.layout.minor_to_major.extend(self.minor_to_major()) - - def as_xla_shape(self): - if self.is_tuple(): - return _xla.Shape.Tuple([x.as_xla_shape() for x in self.tuple_shapes()]) - - return _xla.Shape.Array(self.xla_element_type(), self.dimensions(), - self.minor_to_major()) - - -ProgramShape = collections.namedtuple('ProgramShape', - ('parameter_shapes', 'result_shape')) - - -def _wrap_shape(xla_shape): - element_type = xla_shape.element_type() - if element_type == PrimitiveType.TUPLE: - shapes = tuple(_wrap_shape(sub) for sub in xla_shape.tuple_shapes()) - return Shape.tuple_shape(shapes) - else: - dtype = XLA_ELEMENT_TYPE_TO_DTYPE[element_type] - return Shape.array_shape(dtype, xla_shape.dimensions()) - - -def _wrap_program_shape(program_shape): - return ProgramShape([_wrap_shape(arg) for arg in program_shape.Parameters()], - _wrap_shape(program_shape.Result())) + return convert(pyval) def require_numpy_array_layout(value): @@ -612,8 +455,7 @@ def transfer_from_outfeed(shape, device_ordinal=0): # TODO(phawkins): support non-default backends. 
backend = get_local_backend() return backend.client.TransferFromOutfeed( - shape.with_major_to_minor_layout_if_absent().as_xla_shape(), - device_ordinal) + shape.with_major_to_minor_layout_if_absent(), device_ordinal) class CompileOptions(object): @@ -699,10 +541,10 @@ class Computation(object): return Executable(c, backend=backend) def GetProgramShape(self): - return _wrap_program_shape(self._c_computation.GetProgramShape()) + return self._c_computation.GetProgramShape() def GetReturnValueShape(self): - return _wrap_shape(self._c_computation.GetProgramShape().Result()) + return self._c_computation.GetProgramShape().result_shape() class Executable(object): @@ -717,14 +559,12 @@ class Executable(object): """Returns a list containing the device ordinals for each replica.""" return self._device_ordinals - def Execute(self, arguments=(), check_for_deleted_args=True): + def Execute(self, arguments=None, check_for_deleted_args=True): """Execute on one replica with Buffer arguments and return value.""" + arguments = arguments or [] if check_for_deleted_args and any(arg.is_deleted() for arg in arguments): raise ValueError('Executing with deleted local buffer argument') - raw_args = [arg.c_buffer for arg in arguments] - output_buffer = self._backend.execute(self._c_executable, raw_args) - return Buffer( - output_buffer, backend=self._backend, device=self._device_ordinals[0]) + return self._backend.execute(self._c_executable, arguments) def ExecutePerReplica(self, arguments=None): """Execute on many replicas with Buffer arguments and return value. @@ -753,23 +593,8 @@ class Executable(object): 'Executing on device {} with argument from device {}'.format( self._device_ordinals[replica], arg.device())) - # Pull out argument buffer handles - # pylint: disable=g-complex-comprehension - stripped_args = [ - [arg.c_buffer for arg in replica_args] for replica_args in arguments - ] - # Execute - output_buffers = self._backend.execute_replicated(self._c_executable, - stripped_args) - - # Wrap output handles in Buffer instances - return tuple( - Buffer( - output_buffer, - backend=self._backend, - device=self._device_ordinals[replica]) - for replica, output_buffer in enumerate(output_buffers)) + return self._backend.execute_replicated(self._c_executable, arguments) def ExecuteWithPythonValues(self, arguments=()): """Execute on one replica with Python values as arguments and output.""" @@ -877,7 +702,7 @@ class ComputationBuilder(object): return Computation(self._builder.Build(), backend=backend) def GetShape(self, operand): - return _wrap_shape(self._builder.GetShape(operand)) + return self._builder.GetShape(operand) def SetOpMetadata(self, op_metadata): """Set metadata for operations that are about to be enqueued.""" @@ -896,9 +721,8 @@ class ComputationBuilder(object): Returns: An XlaOp. """ - return ops.Infeed( - self._builder, - shape.with_major_to_minor_layout_if_absent().as_xla_shape()) + return ops.Infeed(self._builder, + shape.with_major_to_minor_layout_if_absent()) def Outfeed(self, operand): """Enqueues an outfeed op onto the computation. 
@@ -995,10 +819,9 @@ class ComputationBuilder(object): if parameter_num is None: parameter_num = next(self._parameter_numbering) - return ops.Parameter( - self._builder, parameter_num, - shape.with_major_to_minor_layout_if_absent().as_xla_shape(), - name.encode('utf8')) + return ops.Parameter(self._builder, parameter_num, + shape.with_major_to_minor_layout_if_absent(), + name.encode('utf8')) def ParameterFromNumpy(self, value, name=None, parameter_num=None): """Enqueues a Parameter op onto the computation. @@ -1013,7 +836,7 @@ class ComputationBuilder(object): An XlaOp. """ return self.ParameterWithShape( - Shape.from_pyval(value), name=name, parameter_num=parameter_num) + shape_from_pyval(value), name=name, parameter_num=parameter_num) def Iota(self, dtype, size): """Enqueues an iota constant onto the computation. @@ -1040,7 +863,7 @@ class ComputationBuilder(object): An XlaOp representing the added broadcasted iota constant. """ element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(np.dtype(dtype))] - xla_shape = _xla.Shape.Array(element_type, shape, None) + xla_shape = _xla.Shape.array_shape(element_type, shape, None) return ops.Iota(self._builder, xla_shape, dimension) def Concatenate(self, operands, dimension): @@ -1097,6 +920,24 @@ class ComputationBuilder(object): dimensions = tuple(range(ndim)) return ops.Reshape(operand, dimensions, new_sizes) + def AllReduce(self, operand, computation, replica_groups=None): + """AllReduce op. + + Args: + operand: XlaOp representing the input array + computation: a Computation object - binary reduction function. + replica_groups: optional, list of lists of ints encoding a partition of + the set {0, 1, ..., num_replicas-1} into equally-sized replica groups + within which the all-reduce is performed. If not supplied or None (the + default), all replicas belong to the same group. + + Returns: + An XlaOp that represents the all-reduced result. + """ + replica_groups_protos = _get_replica_groups_protos(replica_groups) + return ops.AllReduce(operand, computation.computation, + replica_groups_protos, None) + def AllToAll(self, operand, split_dimension, @@ -1117,13 +958,7 @@ class ComputationBuilder(object): Returns: An XlaOp that represents the all-to-all concatenation. """ - if replica_groups is None: - replica_groups_protos = [] # special value for XLA API - else: - replica_groups = list(replica_groups) - replica_groups_protos = [ - _make_replica_group_proto(group) for group in replica_groups - ] + replica_groups_protos = _get_replica_groups_protos(replica_groups) if not replica_groups: split_count = 1 else: @@ -1146,13 +981,8 @@ class ComputationBuilder(object): Returns: An XlaOp that represents on each replica the sum of its group's values. """ - if replica_groups is None: - replica_groups = [] # special value for XLA API - else: - replica_groups = [ - _make_replica_group_proto(group) for group in replica_groups - ] - return ops.CrossReplicaSum(operand, replica_groups) + replica_groups_protos = _get_replica_groups_protos(replica_groups) + return ops.CrossReplicaSum(operand, replica_groups_protos) def Trans(self, operand): """Specialized matrix transpose op.""" @@ -1298,10 +1128,9 @@ class ComputationBuilder(object): An XlaOp representing the added custom call op.
""" opaque = opaque or b'' - return ops.CustomCall( - self._builder, call_target_name, list(operands), - shape_with_layout.as_xla_shape(), - [s.as_xla_shape() for s in operand_shapes_with_layout], opaque) + return ops.CustomCall(self._builder, call_target_name, + list(operands), shape_with_layout, + list(operand_shapes_with_layout), opaque) def Map(self, operands, computation_to_apply, dimensions): """Enqueues a map operation onto the computation. @@ -1389,7 +1218,7 @@ class ComputationBuilder(object): dims: A 1D array-like of nonnegative integers specifying the dimensions. Returns: a XlaOp to the generated array of F32 values. """ - shape = _xla.Shape.Array(self.GetShape(mu).xla_element_type(), dims) + shape = _xla.Shape.array_shape(self.GetShape(mu).xla_element_type(), dims) return ops.RngNormal(mu, sigma, shape) def RngUniform(self, a, b, dims): @@ -1406,7 +1235,7 @@ class ComputationBuilder(object): Returns: a XlaOp to the generated array of values with the same numeric type (F32, S32, or U32) as the arguments a and b. """ - shape = _xla.Shape.Array(self.GetShape(a).xla_element_type(), dims) + shape = _xla.Shape.array_shape(self.GetShape(a).xla_element_type(), dims) return ops.RngUniform(a, b, shape) def While(self, cond, body, init): @@ -1659,7 +1488,6 @@ class ComputationBuilder(object): FftType = _xla.FftType - _UNARY_OPS = [ 'Not', 'Clz', @@ -1732,6 +1560,7 @@ _OTHER_OPS = [ 'Cholesky', 'Clamp', 'Collapse', + 'CollectivePermute', 'ConvertElementType', 'Dot', 'Gather', @@ -1885,3 +1714,14 @@ def _make_replica_group_proto(replica_group): replica_group_proto = ReplicaGroup() replica_group_proto.replica_ids.extend(replica_group) return replica_group_proto + + +def _get_replica_groups_protos(replica_groups): + if replica_groups is None: + replica_groups_protos = [] # special value for XLA API + else: + replica_groups = list(replica_groups) + replica_groups_protos = [ + _make_replica_group_proto(group) for group in replica_groups + ] + return replica_groups_protos diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index 0f268f037f0..b08089bcda8 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -315,10 +315,11 @@ class ComputationsWithConstantsTest(ComputationTest): c.CustomCall( b"test_subtract_f32", operands=(c.ConstantF32Scalar(1.25), c.ConstantF32Scalar(0.5)), - shape_with_layout=xla_client.Shape.array_shape(np.float32, (), ()), + shape_with_layout=xla_client.Shape.array_shape( + np.dtype(np.float32), (), ()), operand_shapes_with_layout=( - xla_client.Shape.array_shape(np.float32, (), ()), - xla_client.Shape.array_shape(np.float32, (), ()), + xla_client.Shape.array_shape(np.dtype(np.float32), (), ()), + xla_client.Shape.array_shape(np.dtype(np.float32), (), ()), )) self._ExecuteAndCompareClose(c, expected=0.75) @@ -1745,7 +1746,7 @@ class EmbeddedComputationsTest(ComputationTest): def testInfeedS32Values(self): to_infeed = NumpyArrayS32([1, 2, 3, 4]) c = self._NewComputation() - c.Infeed(xla_client.Shape.from_pyval(to_infeed[0])) + c.Infeed(xla_client.shape_from_pyval(to_infeed[0])) compiled_c = c.Build().Compile() for item in to_infeed: xla_client.transfer_to_infeed(item) @@ -1757,7 +1758,7 @@ class EmbeddedComputationsTest(ComputationTest): def testInfeedThenOutfeedS32(self): to_round_trip = NumpyArrayS32([1, 2, 3, 4]) c = self._NewComputation() - x = c.Infeed(xla_client.Shape.from_pyval(to_round_trip[0])) + x = 
c.Infeed(xla_client.shape_from_pyval(to_round_trip[0])) c.Outfeed(x) compiled_c = c.Build().Compile() @@ -1767,7 +1768,7 @@ class EmbeddedComputationsTest(ComputationTest): execution.start() xla_client.transfer_to_infeed(want) got = xla_client.transfer_from_outfeed( - xla_client.Shape.from_pyval(to_round_trip[0])) + xla_client.shape_from_pyval(to_round_trip[0])) execution.join() self.assertEqual(want, got) @@ -1803,7 +1804,9 @@ class ErrorTest(ComputationTest): c.ClearOpMetadata() options = xla_client.CompileOptions() - options.argument_layouts = [xla_client.Shape.array_shape(np.float32, [])] + options.argument_layouts = [ + xla_client.Shape.array_shape(np.dtype(np.float32), []) + ] def TestFun(): return c.Build().Compile(compile_options=options) diff --git a/tensorflow/compiler/xla/python/xrt.cc b/tensorflow/compiler/xla/python/xrt.cc index 94037172faa..b6f2495e115 100644 --- a/tensorflow/compiler/xla/python/xrt.cc +++ b/tensorflow/compiler/xla/python/xrt.cc @@ -84,9 +84,9 @@ void AddXrtSubmodule(py::module* module) { .def_property_readonly("tf_device_ids", &XrtContext::tf_device_ids); py::class_<XrtBuffer, std::shared_ptr<XrtBuffer>>(m, "XrtBuffer") - .def_static("FromLiteral", &XrtBuffer::FromLiteral) - .def_static("MakeTuple", &XrtBuffer::MakeTuple) - .def("ToPython", + .def_static("from_literal", &XrtBuffer::FromLiteral) + .def_static("make_tuple", &XrtBuffer::MakeTuple) + .def("to_py", [](std::shared_ptr<XrtBuffer> buffer) -> xla::StatusOr<py::object> { auto literal = absl::make_unique<xla::Literal>(); { @@ -95,8 +95,10 @@ void AddXrtSubmodule(py::module* module) { } return xla::LiteralToPython(std::move(literal)); }) - .def("Delete", &XrtBuffer::Delete) - .def("DestructureTuple", &XrtBuffer::DestructureTuple); + .def("delete", &XrtBuffer::Delete) + .def("destructure", &XrtBuffer::DestructureTuple) + .def("is_deleted", + [](const XrtBuffer& buffer) { return !buffer.handle().valid(); }); py::class_<XrtExecutable, std::shared_ptr<XrtExecutable>>(m, "XrtExecutable") .def_static("Compile", diff --git a/tensorflow/compiler/xla/python/xrt.py b/tensorflow/compiler/xla/python/xrt.py index 32ca3d6c641..7158d176e95 100644 --- a/tensorflow/compiler/xla/python/xrt.py +++ b/tensorflow/compiler/xla/python/xrt.py @@ -31,13 +31,6 @@ from tensorflow.compiler.xla.python import xla_extension as _xla # pylint: enable=g-direct-tensorflow-import -def _make_xla_shape(shape): - if shape.is_tuple(): - return _xla.Shape.Tuple([_make_xla_shape(s) for s in shape.tuple_shapes()]) - return _xla.Shape.Array(shape.xla_element_type(), shape.dimensions(), - shape.minor_to_major()) - - def get_tf_context(target, worker): """Returns a TensorFlow RPC client object. @@ -60,7 +53,8 @@ class XrtBackend(xla_client.Backend): tf_device_type: the type of TensorFlow device to use for XRT (e.g. `"TPU"`). 
""" - def __init__(self, tf_context, tf_device_type): + def __init__(self, tf_context, tf_device_type, platform="tpu"): + super(XrtBackend, self).__init__(platform) self.tf_device_type = tf_device_type self.context = _xla.xrt.XrtContext.Create(tf_context, tf_device_type) @@ -69,30 +63,23 @@ class XrtBackend(xla_client.Backend): return self.context.DeviceCount() def buffer_from_pyval(self, pyval, device=0): - return _xla.xrt.XrtBuffer.FromLiteral(self.context, device, pyval) - - def delete_buffer(self, c_buffer): - c_buffer.Delete() - - def destructure_tuple(self, c_buffer): - return c_buffer.DestructureTuple() + return _xla.xrt.XrtBuffer.from_literal(self.context, device, pyval) def make_tuple(self, buffers, device_ordinal): - return _xla.xrt.XrtBuffer.MakeTuple(self.context, buffers) + return _xla.xrt.XrtBuffer.make_tuple(self.context, buffers) def compile(self, computation, compile_options): # pylint: disable=protected-access - program_shape = xla_client._wrap_program_shape( - computation.GetProgramShape()) + program_shape = computation.GetProgramShape() # pylint: enable=protected-access proto = computation.GetSerializedProto() # TODO(phawkins): use the layouts in compile_options. arg_shapes = [ - _make_xla_shape(shape.with_major_to_minor_layout_if_absent()) - for shape in program_shape.parameter_shapes + shape.with_major_to_minor_layout_if_absent() + for shape in program_shape.parameter_shapes() ] - result_shape = _make_xla_shape( - program_shape.result_shape.with_major_to_minor_layout_if_absent()) + result_shape = ( + program_shape.result_shape().with_major_to_minor_layout_if_absent()) device_assignment = _xla.xrt.AssignDevices(compile_options.num_replicas, 1) return _xla.xrt.XrtExecutable.Compile(self.context, proto, arg_shapes, result_shape, device_assignment) diff --git a/tensorflow/compiler/xla/python/xrt_test.py b/tensorflow/compiler/xla/python/xrt_test.py index ab4e663a62b..8e0691f8a7a 100644 --- a/tensorflow/compiler/xla/python/xrt_test.py +++ b/tensorflow/compiler/xla/python/xrt_test.py @@ -48,7 +48,7 @@ class XrtBackendTest(test.TestCase): b = np.arange(10) c = BuildAddAndScaleComputation( - xla_client.Shape.from_pyval(a), xla_client.Shape.from_pyval(b)) + xla_client.shape_from_pyval(a), xla_client.shape_from_pyval(b)) executable = c.Compile(backend=backend) output = executable.ExecuteWithPythonValues((a, b)) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 7e43f64b4ba..1e7a924e350 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -18,6 +18,9 @@ package_group( includes = [ "//tensorflow/compiler/xla:friends", ], + packages = [ + "//learning/brain/experimental/tf_runtime/...", + ], ) xla_proto_library( @@ -434,10 +437,10 @@ tf_cc_test( srcs = ["pattern_matcher_test.cc"], deps = [ ":hlo", + ":hlo_parser", ":pattern_matcher", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", "@com_google_absl//absl/strings", @@ -505,8 +508,8 @@ cc_library( hdrs = ["hlo_matchers.h"], deps = [ ":hlo", + ":hlo_parser", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", @@ -549,13 +552,13 @@ tf_cc_test( srcs = ["hlo_sharding_test.cc"], deps = [ ":hlo", + ":hlo_parser", 
"//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], @@ -583,6 +586,7 @@ tf_cc_test( srcs = ["call_graph_test.cc"], deps = [ ":call_graph", + ":hlo", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -590,7 +594,6 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -653,6 +656,7 @@ tf_cc_test( deps = [ ":call_graph", ":flatten_call_graph", + ":hlo", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -660,7 +664,6 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -691,7 +694,6 @@ cc_library( deps = [ ":compiler", ":computation_placer", - ":device_memory_allocator", ":platform_util", ":stream_pool", ":transfer_manager", @@ -701,6 +703,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "//third_party/eigen3", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", @@ -721,7 +724,6 @@ cc_library( ":compiler", ":computation_layout", ":computation_placer", - ":device_memory_allocator", ":dump", ":dynamic_dimension_inference", ":executable", @@ -751,6 +753,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:ptr_util", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -767,7 +770,6 @@ cc_library( ":backend", ":compiler", ":computation_layout", - ":device_memory_allocator", ":executable", ":hlo", ":hlo_execution_profile", @@ -787,6 +789,7 @@ cc_library( "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -855,7 +858,6 @@ cc_library( srcs = ["shaped_buffer.cc"], hdrs = ["shaped_buffer.h"], deps = [ - ":device_memory_allocator", "//tensorflow/compiler/xla:shape_tree", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -865,6 +867,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -878,7 +881,6 @@ tf_cc_test( srcs = ["shaped_buffer_test.cc"], deps = [ 
":cpu_plugin", - ":device_memory_allocator", ":platform_util", ":shaped_buffer", "//tensorflow/compiler/xla:shape_util", @@ -888,6 +890,7 @@ tf_cc_test( "//tensorflow/core:ptr_util", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", ], ) @@ -901,7 +904,6 @@ cc_library( ], deps = [ ":computation_layout", - ":device_memory_allocator", ":dump", ":hlo", ":hlo_execution_profile", @@ -922,6 +924,7 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:span", @@ -988,7 +991,6 @@ cc_library( hdrs = ["allocation_tracker.h"], deps = [ ":backend", - ":device_memory_allocator", ":transfer_manager", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -997,6 +999,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -1156,6 +1159,7 @@ tf_cc_test( ":hlo", ":hlo_memory_scheduler", ":hlo_ordering", + ":hlo_parser", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", @@ -1163,7 +1167,6 @@ tf_cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -1205,10 +1208,10 @@ tf_cc_test( ":hlo_dataflow_analysis", ":hlo_memory_scheduler", ":hlo_ordering", + ":hlo_parser", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -1455,8 +1458,8 @@ tf_cc_test( srcs = ["instruction_fusion_test.cc"], deps = [ ":hlo_matchers", + ":hlo_parser", ":instruction_fusion", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], @@ -1467,11 +1470,11 @@ cc_library( srcs = ["multi_output_fusion.cc"], hdrs = ["multi_output_fusion.h"], deps = [ + ":hlo", + ":hlo_pass", ":hlo_reachability", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/core:lib", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -1663,6 +1666,7 @@ cc_library( ":hlo_pass", ":hlo_query", ":pattern_matcher", + "//tensorflow/compiler/xla:comparison_util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", @@ -1788,8 +1792,8 @@ tf_cc_test( srcs = ["gather_expander_test.cc"], deps = [ ":gather_expander", + ":hlo_parser", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:test_macros_header", 
"//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep ], @@ -1887,9 +1891,9 @@ tf_cc_test( name = "while_loop_analysis_test", srcs = ["while_loop_analysis_test.cc"], deps = [ + ":hlo_parser", ":while_loop_analysis", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -2294,7 +2298,7 @@ tf_cc_test( ":cpu_plugin", ":hlo_cost_analysis", ":hlo_execution_profile", - "//tensorflow/compiler/xla/service:hlo_parser", + ":hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -2307,14 +2311,14 @@ tf_cc_test( srcs = ["hlo_computation_test.cc"], deps = [ ":hlo", + ":hlo_matchers", + ":hlo_parser", ":pattern_matcher", ":pattern_matcher_gmock", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla/service:hlo_matchers", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "@com_google_absl//absl/container:flat_hash_map", @@ -2519,13 +2523,13 @@ tf_cc_test( deps = [ ":hlo", ":hlo_liveness_analysis", + ":hlo_parser", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -2909,12 +2913,12 @@ tf_cc_test( deps = [ ":hlo", ":hlo_module_dce", + ":hlo_parser", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -3040,12 +3044,12 @@ tf_cc_test( ":hlo", ":hlo_cse", ":hlo_matchers", + ":hlo_parser", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -3229,27 +3233,6 @@ tf_cc_test( ], ) -cc_library( - name = "device_memory_allocator", - srcs = [ - "device_memory_allocator.cc", - "owning_device_memory.cc", - ], - hdrs = [ - "device_memory_allocator.h", - "owning_device_memory.h", - ], - deps = [ - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", - "@com_google_absl//absl/types:span", - ], -) - cc_library( name = "maybe_owning_device_memory", srcs = [ @@ -3259,7 +3242,7 @@ cc_library( "maybe_owning_device_memory.h", ], deps = [ - ":device_memory_allocator", + 
"//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:variant", ], @@ -3302,10 +3285,10 @@ xla_test( "gpu", ], deps = [ + ":hlo_parser", "//tensorflow/compiler/xla:execution_options_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -3428,6 +3411,7 @@ tf_cc_test( deps = [ ":hlo", ":hlo_matchers", + ":hlo_parser", ":shape_inference", ":transpose_folding", "//tensorflow/compiler/xla:literal", @@ -3436,7 +3420,6 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service/gpu:ir_emission_utils", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -3679,10 +3662,10 @@ tf_cc_test( name = "tuple_util_test", srcs = ["tuple_util_test.cc"], deps = [ + ":hlo_matchers", + ":hlo_parser", ":tuple_util", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_matchers", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], ) @@ -3708,11 +3691,11 @@ tf_cc_test( name = "while_util_test", srcs = ["while_util_test.cc"], deps = [ + ":hlo_matchers", + ":hlo_parser", ":while_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:hlo_matchers", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "@com_google_absl//absl/algorithm:container", ], @@ -3743,9 +3726,9 @@ tf_cc_test( srcs = ["while_loop_invariant_code_motion_test.cc"], deps = [ ":hlo_matchers", + ":hlo_parser", ":while_loop_invariant_code_motion", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:test", ], @@ -3771,9 +3754,9 @@ tf_cc_test( srcs = ["while_loop_constant_sinking_test.cc"], deps = [ ":hlo_matchers", + ":hlo_parser", ":while_loop_constant_sinking", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:test", ], @@ -3973,6 +3956,8 @@ cc_library( hdrs = ["ar_crs_combiner.h"], deps = [ ":call_graph", + ":hlo", + ":hlo_pass", ":pattern_matcher", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", @@ -3980,8 +3965,6 @@ cc_library( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_pass", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", ], @@ -4005,11 +3988,11 @@ cc_library( srcs = ["dynamic_index_splitter.cc"], hdrs = ["dynamic_index_splitter.h"], deps = [ + ":hlo", ":hlo_casting_utils", + ":hlo_pass", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_pass", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", 
"@com_google_absl//absl/container:inlined_vector", @@ -4124,6 +4107,13 @@ cc_library( ], ) +cc_library( + name = "custom_call_target_registry", + srcs = ["custom_call_target_registry.cc"], + hdrs = ["custom_call_target_registry.h"], + visibility = ["//visibility:public"], +) + tf_cc_test( name = "slice_sinker_test", srcs = ["slice_sinker_test.cc"], diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 649ee116b4d..2441e64f3d0 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -33,6 +33,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/types/optional.h" #include "absl/types/span.h" +#include "tensorflow/compiler/xla/comparison_util.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" @@ -183,6 +184,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { Status HandleBroadcast(HloInstruction* broadcast) override; + Status HandleCompare(HloInstruction* compare) override; + Status HandleConcatenate(HloInstruction* concatenate) override; Status HandleConstant(HloInstruction* constant) override; @@ -2213,6 +2216,49 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) { return Status::OK(); } +Status AlgebraicSimplifierVisitor::HandleCompare(HloInstruction* compare) { + HloInstruction* lhs; + HloInstruction* rhs; + CHECK(Match(compare, m::Compare(m::Op(&lhs), m::Op(&rhs)))); + + auto replace_with_pred_broadcast = [&](bool value) { + return ReplaceWithNewInstruction( + compare, + HloInstruction::CreateBroadcast( + compare->shape(), + computation_->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(value))), + {})); + }; + if (compare->comparison_direction() == ComparisonDirection::kLt && + lhs->opcode() == HloOpcode::kIota && IsAll(rhs, 0)) { + return replace_with_pred_broadcast(false); + } else if (compare->comparison_direction() == ComparisonDirection::kGt && + IsAll(lhs, 0) && rhs->opcode() == HloOpcode::kIota) { + return replace_with_pred_broadcast(false); + } else if (compare->comparison_direction() == ComparisonDirection::kGe && + lhs->opcode() == HloOpcode::kIota && IsAll(rhs, 0)) { + return replace_with_pred_broadcast(true); + } else if (compare->comparison_direction() == ComparisonDirection::kLe && + IsAll(lhs, 0) && rhs->opcode() == HloOpcode::kIota) { + return replace_with_pred_broadcast(true); + } + if (lhs == rhs && + primitive_util::IsIntegralType(lhs->shape().element_type())) { + switch (compare->comparison_direction()) { + case ComparisonDirection::kGt: + case ComparisonDirection::kLt: + case ComparisonDirection::kNe: + return replace_with_pred_broadcast(false); + case ComparisonDirection::kEq: + case ComparisonDirection::kGe: + case ComparisonDirection::kLe: + return replace_with_pred_broadcast(true); + } + } + return Status::OK(); +} + // A conversion to the same element type as the operand is a nop and can be // removed. A conversion of a constant can be simplified by making a new // constant. 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 6a1e71ea430..fee95ae7e44 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -5372,21 +5372,54 @@ TEST_F(AlgebraicSimplifierTest, DotContractingReorder_SizeOneDims) { EXPECT_THAT(transpose->dimensions(), ElementsAre(0, 2, 1, 3)); } +// This test exposes a real bug: It tries to read an out-of-bounds array index +// from within ComposePermutations(). TODO(b/132330723): Fix this. TEST_F(AlgebraicSimplifierTest, - DotContractingReorder_NoChangeInContractingDimsOrder) { + DISABLED_DotContractingReorder_NoChangeInContractingDimsOrder) { // No optimization opportunity here because the transpose does not reorder the // contracting dims. const char* kModuleStr = R"( - param = f32[2,5,1,3] parameter(0) - transpose = f32[1,5,2,3] transpose(param), dimensions={2,1,0,3} - reshape = f32[5,6] reshape(transpose) - constant = f32[6,4] constant({{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4}}) - ROOT dot = f32[5,4] dot(reshape, constant), - lhs_contracting_dims={1}, rhs_contracting_dims={0}} + HloModule m + test { + param = f32[2,5,1,3] parameter(0) + transpose = f32[1,5,2,3] transpose(param), dimensions={2,1,0,3} + reshape = f32[5,6] reshape(transpose) + constant = f32[6,4] constant({{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4},{1,2,3,4}}) + ROOT dot = f32[5,4] dot(reshape, constant), + lhs_contracting_dims={1}, rhs_contracting_dims={0} + } )"; TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); ASSERT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); } +TEST_F(AlgebraicSimplifierTest, CompareIota) { + const char* kModuleStr = R"( + HloModule m + test { + zero = s32[] constant(0) + iota = s32[128] iota(), iota_dimension=0 + broad = s32[128] broadcast(zero), dimensions={} + ROOT compare = pred[128] compare(iota, broad), direction=LT + })"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Broadcast(m::ConstantScalar(false)))); +} + +TEST_F(AlgebraicSimplifierTest, CompareSame) { + const char* kModuleStr = R"( + HloModule m + test { + param = s32[123] parameter(0) + ROOT compare = pred[123] compare(param, param), direction=GE + })"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Broadcast(m::ConstantScalar(true)))); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc index 6cb0e985e57..ea56c75b2f2 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.cc +++ b/tensorflow/compiler/xla/service/allocation_tracker.cc @@ -20,13 +20,13 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { @@ -221,8 +221,8 @@ void AllocationTracker::AddAllocationOrIncrementRefCount( auto it = allocation_map.find(device_memory.opaque()); if (it == allocation_map.end()) { allocation_map[device_memory.opaque()] = { - OwningDeviceMemory(device_memory, device_ordinal, - backend_->memory_allocator()), + se::OwningDeviceMemory(device_memory, device_ordinal, + backend_->memory_allocator()), /*ref_count=*/1}; } else { it->second.ref_count++; diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h index 98d1a302a9f..6e7f9fdfc13 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.h +++ b/tensorflow/compiler/xla/service/allocation_tracker.h @@ -77,7 +77,7 @@ class AllocationTracker { // Data structure encapsulating single memory allocation on the device. struct Allocation { // The pointer to this allocation. - OwningDeviceMemory device_memory; + se::OwningDeviceMemory device_memory; // This is the number of times this memory allocation is referred to by // registered data handles. diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc index 787fe1672ab..1ca2280fc28 100644 --- a/tensorflow/compiler/xla/service/ar_crs_combiner.cc +++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc @@ -107,44 +107,90 @@ absl::optional<HloInstruction*> ArCrsCombiner::WhileFromBodyParameter( return absl::nullopt; } +absl::optional<HloInstruction*> ArCrsCombiner::ConditionalFromBodyParameter( + HloInstruction* instruction) { + CHECK_EQ(HloOpcode::kParameter, instruction->opcode()); + HloComputation* computation = instruction->parent(); + auto caller_instructions = call_graph_->GetComputationCallers(computation); + if (caller_instructions.size() == 1) { + auto caller_instruction = caller_instructions[0]; + if (caller_instruction->opcode() == HloOpcode::kConditional) { + return caller_instruction; + } + } + return absl::nullopt; +} + std::vector<HloInstruction*> ArCrsCombiner::GetAllTuples( HloInstruction* instruction) { - if (instruction->opcode() == HloOpcode::kTuple) { - return {instruction}; - } - if (instruction->opcode() == HloOpcode::kDomain) { - return GetAllTuples(instruction->operands()[0]); - } - if (instruction->opcode() == HloOpcode::kParameter) { - auto maybe_while = WhileFromBodyParameter(instruction); - if (!maybe_while) { - return {}; - } - auto while_instr = *maybe_while; - auto init_tuples = GetAllTuples(while_instr->while_init()); - auto body_tuples = - GetAllTuples(while_instr->while_body()->root_instruction()); - if (init_tuples.empty() || body_tuples.empty()) { - return {}; - } - init_tuples.insert(init_tuples.end(), body_tuples.begin(), - body_tuples.end()); - return init_tuples; - } - if (instruction->opcode() == HloOpcode::kGetTupleElement) { - std::vector<HloInstruction*> result_tuples; - for (auto tuple : GetAllTuples(instruction->operands()[0])) { - auto tmp_tuples = - 
GetAllTuples(tuple->mutable_operand(instruction->tuple_index())); - if (tmp_tuples.empty()) { - return {}; + switch (instruction->opcode()) { + case HloOpcode::kTuple: + return {instruction}; + case HloOpcode::kDomain: + return GetAllTuples(instruction->operands()[0]); + case HloOpcode::kParameter: { + auto maybe_while = WhileFromBodyParameter(instruction); + if (maybe_while) { + auto while_instr = *maybe_while; + auto init_tuples = GetAllTuples(while_instr->while_init()); + auto body_tuples = + GetAllTuples(while_instr->while_body()->root_instruction()); + if (init_tuples.empty() || body_tuples.empty()) { + return {}; + } + init_tuples.insert(init_tuples.end(), body_tuples.begin(), + body_tuples.end()); + return init_tuples; } - result_tuples.insert(result_tuples.end(), tmp_tuples.begin(), - tmp_tuples.end()); + auto maybe_conditional = ConditionalFromBodyParameter(instruction); + if (maybe_conditional) { + auto cond_instr = *maybe_conditional; + std::vector<HloInstruction*> tuples; + for (int64 i = 0; i < cond_instr->branch_computations().size(); ++i) { + if (cond_instr->branch_computation(i)->parameter_instruction(0) == + instruction) { + // If the same computation is used for more than one branch of the + // conditional, we collect the arguments that flow to the + // computation from all branches. + auto branch_tuples = + GetAllTuples(cond_instr->mutable_operand(i + 1)); + if (branch_tuples.empty()) { + return {}; + } + tuples.insert(tuples.end(), branch_tuples.begin(), + branch_tuples.end()); + } + } + return tuples; + } + return {}; } - return result_tuples; + case HloOpcode::kGetTupleElement: { + std::vector<HloInstruction*> result_tuples; + for (auto tuple : GetAllTuples(instruction->operands()[0])) { + auto tmp_tuples = + GetAllTuples(tuple->mutable_operand(instruction->tuple_index())); + if (tmp_tuples.empty()) { + return {}; + } + result_tuples.insert(result_tuples.end(), tmp_tuples.begin(), + tmp_tuples.end()); + } + return result_tuples; + } + case HloOpcode::kConditional: { + std::vector<HloInstruction*> result_tuples; + for (HloComputation* body : instruction->branch_computations()) { + if (body->root_instruction()->opcode() != HloOpcode::kTuple) { + return {}; + } + result_tuples.push_back(body->root_instruction()); + } + return result_tuples; + } + default: + return {}; } - return {}; } bool ArCrsCombiner::TupleElementsComputeSameValue( diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.h b/tensorflow/compiler/xla/service/ar_crs_combiner.h index 2ae556068fe..e5926c7de91 100644 --- a/tensorflow/compiler/xla/service/ar_crs_combiner.h +++ b/tensorflow/compiler/xla/service/ar_crs_combiner.h @@ -119,6 +119,12 @@ class ArCrsCombiner : public HloModulePass { absl::optional<HloInstruction*> WhileFromBodyParameter( HloInstruction* instruction); + // If the passed instruction is a parameter in one of the branch computations, + // and the branch body is only called by a single instruction, return the + // conditional instruction. + absl::optional<HloInstruction*> ConditionalFromBodyParameter( + HloInstruction* instruction); + // Returns a vector of tuple instructions. // If all instructions that flow to "instruction" are tuples, return them. // Otherwise, return an empty vector. 
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc index b972b1289b9..e972e3ce4be 100644 --- a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc +++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc @@ -1173,5 +1173,47 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) { EXPECT_FALSE(changed); } +TEST_F(ArCrsCombinerTest, SameValueTestConditional) { + const char* module_str = R"( +HloModule foobar + +branch_true { + pt = (f32[2,4], f32[2,4]) parameter(0) + gte.0 = f32[2,4] get-tuple-element(pt), index=0 + gte.1 = f32[2,4] get-tuple-element(pt), index=1 + ROOT tuple.t = (f32[2,4], f32[2,4]) tuple(gte.1, gte.0) +} + +branch_false { + pf = (f32[2,4], f32[2,4]) parameter(0) + gte.0 = f32[2,4] get-tuple-element(pf), index=0 + gte.1 = f32[2,4] get-tuple-element(pf), index=1 + add = f32[2,4] add(gte.1, gte.1) + ROOT tuple.f = (f32[2,4], f32[2,4]) tuple(gte.0, add) +} + +ENTRY Parameters1.v4 { + constant = pred[] constant(true) + p = f32[2,4] parameter(0) + tuple = (f32[2,4], f32[2,4]) tuple(p, p) + ROOT conditional = (f32[2,4], f32[2,4]) conditional(constant, tuple, tuple), true_computation=branch_true, false_computation=branch_false +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module, + ParseAndReturnVerifiedModule(module_str)); + auto cond = module->entry_computation()->root_instruction(); + + auto branch_true = cond->branch_computation(0)->root_instruction(); + auto t0 = branch_true->mutable_operand(0); + auto t1 = branch_true->mutable_operand(1); + EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(t0, t1)); + + auto branch_false = cond->branch_computation(1)->root_instruction(); + auto f0 = branch_false->mutable_operand(0); + auto f1 = branch_false->mutable_operand(1); + EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(f0, f1)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc index 1528ec61354..d859f647ea0 100644 --- a/tensorflow/compiler/xla/service/backend.cc +++ b/tensorflow/compiler/xla/service/backend.cc @@ -134,7 +134,7 @@ Backend::Backend(se::Platform* platform, Compiler* compiler, } } // Create a memory allocator for the valid stream executors. - memory_allocator_ = absl::make_unique<StreamExecutorMemoryAllocator>( + memory_allocator_ = absl::make_unique<se::StreamExecutorMemoryAllocator>( platform, stream_executors); CHECK(!stream_executors_.empty()) << "Service found no devices for backend " << platform_->Name() << '.'; diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h index e7f29a044b9..79fdeb2b0bc 100644 --- a/tensorflow/compiler/xla/service/backend.h +++ b/tensorflow/compiler/xla/service/backend.h @@ -27,7 +27,6 @@ limitations under the License. #include "absl/types/span.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/computation_placer.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/stream_pool.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/statusor.h" @@ -35,6 +34,7 @@ limitations under the License. 
#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace Eigen { struct ThreadPoolDevice; @@ -88,7 +88,7 @@ class Backend { // Accessors for the various objects. se::Platform* platform() const { return platform_; } Compiler* compiler() const { return compiler_; } - DeviceMemoryAllocator* memory_allocator() const { + se::DeviceMemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } TransferManager* transfer_manager() const { return transfer_manager_; } @@ -179,7 +179,7 @@ class Backend { stream_pools_ GUARDED_BY(mu_); // The default memory allocator to use. - std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_; + std::unique_ptr<se::StreamExecutorMemoryAllocator> memory_allocator_; // For the CPU backend, an Eigen threadpool device for use by Eigen code. struct IntraOpThreadPool; diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index 9b483bd97e9..631a7dd7e6a 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -75,8 +75,10 @@ class AotCompilationOptions { // Optional allocator that may be used for allocating temp space on the device // during compilation. - DeviceMemoryAllocator* device_allocator() const { return device_allocator_; } - void set_device_allocator(DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator() const { + return device_allocator_; + } + void set_device_allocator(se::DeviceMemoryAllocator* device_allocator) { device_allocator_ = device_allocator; } @@ -98,7 +100,7 @@ class AotCompilationOptions { AotCompilationOptions(); private: - DeviceMemoryAllocator* device_allocator_ = nullptr; + se::DeviceMemoryAllocator* device_allocator_ = nullptr; DebugOptions debug_options_; absl::optional<DeviceAssignment> static_device_assignment_; }; @@ -147,14 +149,14 @@ class Compiler { // allocated should be deallocated before this function returns. virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses( std::unique_ptr<HloModule> module, se::StreamExecutor* executor, - DeviceMemoryAllocator* device_allocator) = 0; + se::DeviceMemoryAllocator* device_allocator) = 0; // Optimizes a HLO module group, a set of module which runs concurrently on // multiple devices potentially communicating data between the modules. virtual Status RunHloPassesOnModuleGroup( HloModuleGroup* module_group, absl::Span<se::StreamExecutor* const> executors, - DeviceMemoryAllocator* device_allocator) = 0; + se::DeviceMemoryAllocator* device_allocator) = 0; // Compiles the HLO module for execution on a device given by the executor, // and returns an executable object or an error status. No HLO passes are @@ -168,7 +170,7 @@ class Compiler { // device_allocator is optional; see RunHloPasses. virtual StatusOr<std::unique_ptr<Executable>> RunBackend( std::unique_ptr<HloModule> module, se::StreamExecutor* executor, - DeviceMemoryAllocator* device_allocator) = 0; + se::DeviceMemoryAllocator* device_allocator) = 0; // Compiles a set of HLO modules that can run in parallel, potentially // communicating data between the modules. 
@@ -176,7 +178,7 @@ class Compiler { RunBackendOnModuleGroup( std::unique_ptr<HloModuleGroup> module_group, std::vector<std::vector<se::StreamExecutor*>> stream_exec, - DeviceMemoryAllocator* device_allocator) = 0; + se::DeviceMemoryAllocator* device_allocator) = 0; // Compiles a set of HLO modules that can run in parallel, potentially // communicating data between the modules, and returns a corresponding @@ -189,7 +191,7 @@ class Compiler { virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile( std::unique_ptr<HloModuleGroup> module_group, std::vector<std::vector<se::StreamExecutor*>> stream_exec, - DeviceMemoryAllocator* device_allocator) = 0; + se::DeviceMemoryAllocator* device_allocator) = 0; // Returns the backend configurations that the backend will consider for the // given HLO. Returns no configurations if the backend does not support diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 529ed121731..09f5c859af4 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -182,7 +182,6 @@ cc_library( deps = [ ":compiler_functor", ":cpu_runtime", - ":custom_call_target_registry", ":disassembler", ":orc_jit_memory_mapper", ":runtime_fp16", @@ -203,6 +202,7 @@ cc_library( "@llvm//:orc_jit", "@llvm//:support", "@llvm//:target", # fixdeps: keep + "//tensorflow/compiler/xla/service:custom_call_target_registry", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", @@ -245,7 +245,6 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:computation_layout", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_execution_profile", @@ -255,6 +254,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor/host:host_stream", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -946,17 +946,6 @@ cc_library( ], ) -cc_library( - name = "custom_call_target_registry", - srcs = [ - "custom_call_target_registry.cc", - ], - hdrs = [ - "custom_call_target_registry.h", - ], - visibility = ["//visibility:public"], -) - cc_library( name = "orc_jit_memory_mapper", srcs = ["orc_jit_memory_mapper.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index cbebfb08f06..06ea1e2f8bd 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -537,7 +537,7 @@ Status CreateHloProfilingArtifacts( StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses( std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/, - DeviceMemoryAllocator* /*device_allocator*/) { + se::DeviceMemoryAllocator* /*device_allocator*/) { std::unique_ptr<llvm::TargetMachine> jit_target_machine = SimpleOrcJIT::InferTargetMachineForJIT( CompilerTargetOptions(module->config()), @@ -597,7 +597,7 @@ struct OrcJITPostCompilationHook { StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend( std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* /*device_allocator*/) { + 
se::DeviceMemoryAllocator* /*device_allocator*/) { VLOG(1) << "Compiling: " << module->name(); XLA_SCOPED_LOGGING_TIMER( absl::StrFormat("Compiling [%s] for CPU using JIT", module->name())); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h index 8ff0fd5a5c5..dd15891f175 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h @@ -133,11 +133,11 @@ class CpuCompiler : public LLVMCompiler { StatusOr<std::unique_ptr<HloModule>> RunHloPasses( std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr<std::unique_ptr<Executable>> RunBackend( std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>> CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group, diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 23d0af34233..cc0f808569a 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -73,13 +73,13 @@ CpuExecutable::CpuExecutable( } StatusOr<std::pair<std::vector<se::DeviceMemoryBase>, - std::vector<OwningDeviceMemory>>> + std::vector<se::OwningDeviceMemory>>> CpuExecutable::CreateBufferTable( - DeviceMemoryAllocator* memory_allocator, int device_ordinal, + se::DeviceMemoryAllocator* memory_allocator, int device_ordinal, absl::Span<const ShapedBuffer* const> arguments) { std::vector<se::DeviceMemoryBase> unowning_buffers( assignment_->Allocations().size()); - std::vector<OwningDeviceMemory> owning_buffers( + std::vector<se::OwningDeviceMemory> owning_buffers( assignment_->Allocations().size()); VLOG(3) << "Allocating " << assignment_->Allocations().size() << " allocations for module " << module().name(); @@ -207,7 +207,7 @@ Status CpuExecutable::ExecuteComputeFunction( StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, - absl::Span<OwningDeviceMemory> buffers) { + absl::Span<se::OwningDeviceMemory> buffers) { se::Stream* stream = run_options->stream(); ScopedShapedBuffer result_buffer( /*on_host_shape=*/result_shape(), @@ -216,7 +216,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer( const HloInputOutputAliasConfig& input_output_alias = module().input_output_alias_config(); - // Move OwningDeviceMemory values which contain the array(s) of the result + // Move se::OwningDeviceMemory values which contain the array(s) of the result // into the respective location in ScopedShapedBuffer which is returned to the // caller. 
TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus( @@ -235,7 +235,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer( const BufferAllocation::Slice slice, this->assignment_->GetUniqueSlice(src, buffer_source->index())); const BufferAllocation::Index buffer_index = slice.index(); - OwningDeviceMemory& buffer = buffers[buffer_index]; + se::OwningDeviceMemory& buffer = buffers[buffer_index]; if (!slice.allocation()->is_entry_computation_parameter()) { // If the buffer coming out of the result is from a parameter, the // owning buffer will be null, and that means the caller aliased some @@ -297,8 +297,8 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl( auto* host_stream = dynamic_cast<se::host::HostStream*>( run_options->stream()->implementation()); se::Stream* stream = run_options->stream(); - DeviceMemoryAllocator* memory_allocator = run_options->allocator(); - std::vector<OwningDeviceMemory> owning_buffers; + se::DeviceMemoryAllocator* memory_allocator = run_options->allocator(); + std::vector<se::OwningDeviceMemory> owning_buffers; std::vector<se::DeviceMemoryBase> unowning_buffers; TF_ASSIGN_OR_RETURN( std::tie(unowning_buffers, owning_buffers), @@ -326,7 +326,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl( CpuExecutable* executable; ServiceExecutableRunOptions run_options; std::vector<se::DeviceMemoryBase> unowning_buffers; - std::shared_ptr<std::vector<OwningDeviceMemory>> buffers; + std::shared_ptr<std::vector<se::OwningDeviceMemory>> buffers; HloExecutionProfile* hlo_execution_profile; void operator()() { @@ -338,7 +338,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl( }; host_stream->EnqueueTask( AsyncRunTask{this, *run_options, std::move(unowning_buffers), - std::make_shared<std::vector<OwningDeviceMemory>>( + std::make_shared<std::vector<se::OwningDeviceMemory>>( std::move(owning_buffers)), hlo_execution_profile}); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index 3b91b15ba9b..735a20749b9 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -25,7 +25,6 @@ limitations under the License. #include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -37,6 +36,7 @@ limitations under the License. #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace cpu { @@ -111,8 +111,9 @@ class CpuExecutable : public Executable { // storage and the live-out buffer into which the computation writes it // result. 
StatusOr<std::pair<std::vector<se::DeviceMemoryBase>, - std::vector<OwningDeviceMemory>>> - CreateBufferTable(DeviceMemoryAllocator* memory_allocator, int device_ordinal, + std::vector<se::OwningDeviceMemory>>> + CreateBufferTable(se::DeviceMemoryAllocator* memory_allocator, + int device_ordinal, absl::Span<const ShapedBuffer* const> arguments); // Calls the generated function performing the computation with the given @@ -126,7 +127,7 @@ class CpuExecutable : public Executable { // The addresses are set according to buffer assignment. StatusOr<ScopedShapedBuffer> CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, - absl::Span<OwningDeviceMemory> buffers); + absl::Span<se::OwningDeviceMemory> buffers); // Returns the points-to set of the root instruction of the entry // computation. Uses points-to analysis from buffer assignment. diff --git a/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h b/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h deleted file mode 100644 index 664125ecc95..00000000000 --- a/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_ - -// This file is depended on by kernels that have to build for mobile devices. -// For this reason, we avoid relying on TensorFlow and instead only use the -// standard C++ library. - -#include <mutex> // NOLINT -#include <string> -#include <unordered_map> - -namespace xla { -namespace cpu { - -// The CPU JIT compiler uses this registry to resolve symbolic CustomCall -// targets; so when using the CPU JIT, CustomCall targets need to be registered -// here with the symbol name used in the CustomCall. -// -// The XLA AOT compiler links using a standard offline linker; so when compiling -// in AOT mode, you *also* need to make sure the name of the callee (presumably -// implemented in C++) matches up with the symbolic name used in the CustomCall. -// -// We maintain the registry in both the JIT and the AOT cases for simplicity, -// but we only use it when running in JIT mode. 
-class CustomCallTargetRegistry { - public: - static CustomCallTargetRegistry* Global(); - - void Register(const std::string& symbol, void* address); - void* Lookup(const std::string& symbol) const; - - private: - std::unordered_map<std::string, void*> registered_symbols_; - mutable std::mutex mu_; -}; - -class RegisterCustomCallTarget { - public: - explicit RegisterCustomCallTarget(const std::string& name, void* address) { - CustomCallTargetRegistry::Global()->Register(name, address); - } -}; - -#define REGISTER_CUSTOM_CALL_CONCAT(a, b) a##b - -#define REGISTER_CUSTOM_CALL_TARGET_WITH_SYM_HELPER(symbol, address, counter) \ - static ::xla::cpu::RegisterCustomCallTarget REGISTER_CUSTOM_CALL_CONCAT( \ - custom_call_target_register, counter)(symbol, \ - reinterpret_cast<void*>(address)) - -#define REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(symbol, address) \ - REGISTER_CUSTOM_CALL_TARGET_WITH_SYM_HELPER(symbol, address, __COUNTER__) - -#define REGISTER_CUSTOM_CALL_TARGET(function) \ - REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(#function, function) - -} // namespace cpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_ diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc index 93ef51754d2..a4bb5f72297 100644 --- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc @@ -119,13 +119,9 @@ llvm::Value* GenerateVF32Exp(llvm::IRBuilder<>* b, llvm::Value* input, int32 vector_width) { VectorSupportLibrary vsl(F32, vector_width, b, "exp_f32"); - // This implements the same polynomial approximation as implemented in Eigen3. - + // This implements the same polynomial approximation as implemented in Cephes. const llvm::APFloat half = GetIeeeF32(0.5); - const llvm::APFloat one = GetIeeeF32(1.0); - - const llvm::APFloat exp_hi = GetIeeeF32(88.3762626647950); - const llvm::APFloat exp_lo = GetIeeeF32(-88.3762626647949); + const llvm::APFloat one = GetIeeeF32(1); const llvm::APFloat cephes_LOG2EF = GetIeeeF32(1.44269504088896341); const llvm::APFloat cephes_exp_C1 = GetIeeeF32(0.693359375); @@ -138,39 +134,79 @@ llvm::Value* GenerateVF32Exp(llvm::IRBuilder<>* b, llvm::Value* input, const llvm::APFloat cephes_exp_p4 = GetIeeeF32(1.6666665459E-1); const llvm::APFloat cephes_exp_p5 = GetIeeeF32(5.0000001201E-1); - llvm::Value* input_clamped = - vsl.Clamp(input, /*low=*/exp_lo, /*high=*/exp_hi); - llvm::Value* fx = vsl.Floor(vsl.MulAdd(input_clamped, cephes_LOG2EF, half)); - llvm::Value* tmp = vsl.Mul(cephes_exp_C1, fx); - llvm::Value* z = vsl.Mul(cephes_exp_C2, fx); - llvm::Value* x = vsl.Sub(input_clamped, tmp); - x = vsl.Sub(x, z); - z = vsl.Mul(x, x); + // To compute e^input, we re-express it as + // + // e^input = e^(a + b) + // = e^(a + n log(2)) + // = e^a * 2^n. + // + // We choose n = floor(a * log(2) + 0.5), restricting the value of `a` to + // (-0.5, 0.5). We then use a polynomial to compute e^a. - llvm::Value* y = vsl.MulAdd(x, cephes_exp_p0, cephes_exp_p1); - y = vsl.MulAdd(y, x, cephes_exp_p2); - y = vsl.MulAdd(y, x, cephes_exp_p3); - y = vsl.MulAdd(y, x, cephes_exp_p4); - y = vsl.MulAdd(y, x, cephes_exp_p5); - y = vsl.MulAdd(y, z, x); - y = vsl.Add(one, y); + // Restrict input to a small range, including some values that evaluate to + // +/- inf. Our computations below aren't particularly sensitive to the exact + // choices here, so we choose values a bit larger/smaller than + // + // log(F32_MAX) = 88.723... 
+ // log(F32_EPSILON) = -103.279.... + // + input = vsl.Clamp(input, GetIeeeF32(-104), GetIeeeF32(88.8)); - // VectorSupportLibrary (intentionally) can't juggle more than one type at a - // time so drop down to IRBuilder for this bit. - llvm::Value* vector_constant_0x7f = - b->CreateVectorSplat(vector_width, b->getInt32(0x7f)); - llvm::Value* vector_constant_23 = - b->CreateVectorSplat(vector_width, b->getInt32(23)); - llvm::Type* i32_vector_type = - llvm::VectorType::get(b->getInt32Ty(), vector_width); - // fx is clamped so we don't have to worry about it being out of range for - // i32. - llvm::Value* emm0 = b->CreateFPToSI(fx, i32_vector_type); - emm0 = b->CreateAdd(emm0, vector_constant_0x7f); - emm0 = b->CreateShl(emm0, vector_constant_23); - llvm::Value* emm0_f32 = b->CreateBitCast(emm0, vsl.vector_type()); + llvm::Value* x = input; + llvm::Value* n = vsl.Floor(vsl.MulAdd(input, cephes_LOG2EF, half)); - return vsl.Max(vsl.Mul(y, emm0_f32), input); + // When we eventually do the multiplication in e^a * 2^n, we need to handle + // the case when n > 127, the max fp32 exponent (so 2^n == inf) but e^a < 1 + // (so e^a * 2^n != inf). There's a similar problem for n < -126, the + // smallest fp32 exponent. + // + // A straightforward solution would be to detect n out of range and split it + // up, doing + // + // e^a * 2^n = e^a * 2^(n1 + n2) + // = (2^n1 * e^a) * 2^n2. + // + // But it turns out this approach is quite slow. It's not clear why; our + // hypothesis is that the integer operations on the exponent `n` have nonlocal + // effects on the pipeline. + // + // The approach we use instead is to clamp n to [-126, 127] so 2^n doesn't + // over/underflow. This causes `a` to be outside the range (-0.5, 0.5), which + // means that our polynomial for e^a will give a less-accurate result. In + // practice this seems to work well enough; it passes our exhaustive tests, + // breaking only one result, and by one ulp (we return exp(88.7228394) = + // max-float but we should return inf). + n = vsl.Clamp(n, GetIeeeF32(-126), GetIeeeF32(127)); + + // Polynomial to compute z = e^a, accurate for a in (-0.5, 0.5). + x = vsl.Sub(x, vsl.Mul(cephes_exp_C1, n)); + x = vsl.Sub(x, vsl.Mul(cephes_exp_C2, n)); + llvm::Value* z = vsl.MulAdd(x, cephes_exp_p0, cephes_exp_p1); + z = vsl.MulAdd(z, x, cephes_exp_p2); + z = vsl.MulAdd(z, x, cephes_exp_p3); + z = vsl.MulAdd(z, x, cephes_exp_p4); + z = vsl.MulAdd(z, x, cephes_exp_p5); + z = vsl.MulAdd(z, vsl.Mul(x, x), x); + z = vsl.Add(one, z); + + // Convert n to an i32. This is safe because we clamped it above. + llvm::Value* n_i32 = + b->CreateFPToSI(n, llvm::VectorType::get(b->getInt32Ty(), vector_width)); + + // Create 2^n as an fp32. This works because -126 <= n <= 127 means that n is + // within the bounds for an fp32 exponent. + auto splat_i32 = [&](int32 v) { + return b->CreateVectorSplat(vector_width, b->getInt32(v)); + }; + const int32 kF32SignificandBits = 23; + llvm::Value* exp_bias = splat_i32(0x7f); + llvm::Value* pow2 = + b->CreateBitCast(b->CreateShl(b->CreateAdd(n_i32, exp_bias), + splat_i32(kF32SignificandBits)), + vsl.vector_type()); + + // Return z * 2^n. 
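The rewritten GenerateVF32Exp above uses the Cephes-style range reduction e^x = e^a * 2^n, where n = floor(x * log2(e) + 0.5) and a = x - n*ln(2) stays small; n is clamped to the fp32 exponent range [-126, 127], and the input is clamped slightly beyond [ln(denorm_min), ln(FLT_MAX)] ~= [-103.3, 88.72]. A scalar C++ sketch of the same computation, for reference only; the cephes_exp_C2 and cephes_exp_p0..p3 constants are not visible in this hunk and are assumed to be the standard Cephes/Eigen values:

```c++
#include <cmath>
#include <cstdint>
#include <cstring>

// Scalar reference for the vectorized lowering above; not the emitted IR.
// NaN handling differs slightly from the select-based Clamp used there.
float ExpF32Reference(float x) {
  // Clamp slightly beyond [ln(denorm_min), ln(FLT_MAX)] ~= [-103.3, 88.72],
  // mirroring vsl.Clamp(input, -104, 88.8) above.
  x = std::fmax(-104.0f, std::fmin(x, 88.8f));

  // n = floor(x * log2(e) + 0.5), clamped to the fp32 exponent range so that
  // 2^n stays representable.
  float n = std::floor(x * 1.44269504088896341f + 0.5f);  // cephes_LOG2EF
  n = std::fmax(-126.0f, std::fmin(n, 127.0f));

  // a = x - n * ln(2), with ln(2) split into C1 + C2 for extra precision.
  float a = x - n * 0.693359375f;   // cephes_exp_C1
  a = a - n * -2.12194440e-4f;      // cephes_exp_C2 (assumed value)

  // Polynomial approximation of e^a, accurate for a in roughly (-0.5, 0.5).
  float z = 1.9875691500e-4f;       // cephes_exp_p0 (assumed value)
  z = z * a + 1.3981999507e-3f;     // cephes_exp_p1 (assumed value)
  z = z * a + 8.3334519073e-3f;     // cephes_exp_p2 (assumed value)
  z = z * a + 4.1665795894e-2f;     // cephes_exp_p3 (assumed value)
  z = z * a + 1.6666665459e-1f;     // cephes_exp_p4
  z = z * a + 5.0000001201e-1f;     // cephes_exp_p5
  z = z * (a * a) + a;
  z = z + 1.0f;

  // Build 2^n by placing (n + 127) in the fp32 exponent field; safe because
  // n was clamped to [-126, 127].
  uint32_t bits = static_cast<uint32_t>(static_cast<int32_t>(n) + 127) << 23;
  float pow2;
  std::memcpy(&pow2, &bits, sizeof(pow2));

  return z * pow2;
}
```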
+ return vsl.Mul(z, pow2); } llvm::Value* GenerateVF32Log(llvm::IRBuilder<>* b, llvm::Value* input, diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index 4ac68691e73..bf55e9e22cf 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h" #include <stdint.h> + #include <algorithm> #include <list> #include <utility> @@ -28,7 +29,6 @@ limitations under the License. #include "llvm/Support/CodeGen.h" #include "llvm/Support/Host.h" #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h" -#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h" #include "tensorflow/compiler/xla/service/cpu/runtime_conv2d.h" #include "tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.h" @@ -42,6 +42,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h" +#include "tensorflow/compiler/xla/service/custom_call_target_registry.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/logging.h" @@ -146,16 +147,18 @@ llvm::JITSymbol SimpleOrcJIT::ResolveRuntimeSymbol(const std::string& name) { // On Mac OS X, 'name' may have a leading underscore prefix, even though the // registered name may not. std::string stripped_name(name.begin() + 1, name.end()); - func_addr = CustomCallTargetRegistry::Global()->Lookup(stripped_name); + func_addr = + xla::CustomCallTargetRegistry::Global()->Lookup(stripped_name, "Host"); } else { - func_addr = CustomCallTargetRegistry::Global()->Lookup(name); + func_addr = xla::CustomCallTargetRegistry::Global()->Lookup(name, "Host"); } if (func_addr == nullptr) { LOG(ERROR) << "Unable to resolve runtime symbol: `" << name << "'. Hint: if the symbol a custom call target, make sure you've " - "registered it with the JIT using REGISTER_CUSTOM_CALL_TARGET."; + "registered it with the JIT using " + "XLA_CPU_REGISTER_CUSTOM_CALL_TARGET."; return nullptr; } llvm::JITEvaluatedSymbol symbol_info(reinterpret_cast<uint64_t>(func_addr), @@ -209,14 +212,15 @@ llvm::JITSymbol SimpleOrcJIT::FindCompiledSymbol(const std::string& name) { namespace { // Register some known symbols with the CustomCallTargetRegistry. 
bool RegisterKnownJITSymbols() { - CustomCallTargetRegistry* registry = CustomCallTargetRegistry::Global(); + xla::CustomCallTargetRegistry* registry = + xla::CustomCallTargetRegistry::Global(); #define REGISTER_CPU_RUNTIME_SYMBOL(base_name) \ do { \ auto* function_address = \ reinterpret_cast<void*>(__xla_cpu_runtime_##base_name); \ registry->Register(xla::cpu::runtime::k##base_name##SymbolName, \ - function_address); \ + function_address, "Host"); \ CHECK_EQ(absl::string_view(xla::cpu::runtime::k##base_name##SymbolName), \ "__xla_cpu_runtime_" #base_name); \ } while (false) @@ -247,8 +251,10 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(TracingStart); REGISTER_CPU_RUNTIME_SYMBOL(TracingEnd); - registry->Register("__gnu_f2h_ieee", reinterpret_cast<void*>(__gnu_f2h_ieee)); - registry->Register("__gnu_h2f_ieee", reinterpret_cast<void*>(__gnu_h2f_ieee)); + registry->Register("__gnu_f2h_ieee", reinterpret_cast<void*>(__gnu_f2h_ieee), + "Host"); + registry->Register("__gnu_h2f_ieee", reinterpret_cast<void*>(__gnu_h2f_ieee), + "Host"); #undef REGISTER_CPU_RUNTIME_SYMBOL @@ -256,11 +262,12 @@ bool RegisterKnownJITSymbols() { // Unfortunately the double versions are overloaded on some systems, e.g. // Mac so we need an explicit cast. This requires passing the function signature // for that case. -#define REGISTER_LIBM_SYMBOL(name, double_sig) \ - do { \ - registry->Register(#name "f", reinterpret_cast<void*>(name##f)); \ - registry->Register( \ - #name, reinterpret_cast<void*>(static_cast<double_sig>(name))); \ +#define REGISTER_LIBM_SYMBOL(name, double_sig) \ + do { \ + registry->Register(#name "f", reinterpret_cast<void*>(name##f), "Host"); \ + registry->Register(#name, \ + reinterpret_cast<void*>(static_cast<double_sig>(name)), \ + "Host"); \ } while (false) REGISTER_LIBM_SYMBOL(acos, double (*)(double)); @@ -318,8 +325,9 @@ bool RegisterKnownJITSymbols() { #ifdef __APPLE__ REGISTER_LIBM_SYMBOL(__sincos, void (*)(double, double*, double*)); registry->Register("__sincosf_stret", - reinterpret_cast<void*>(__sincosf_stret)); - registry->Register("__sincos_stret", reinterpret_cast<void*>(__sincos_stret)); + reinterpret_cast<void*>(__sincosf_stret), "Host"); + registry->Register("__sincos_stret", reinterpret_cast<void*>(__sincos_stret), + "Host"); #else REGISTER_LIBM_SYMBOL(sincos, void (*)(double, double*, double*)); #endif @@ -332,19 +340,19 @@ bool RegisterKnownJITSymbols() { #undef REGISTER_LIBM_SYMBOL - registry->Register("memcpy", reinterpret_cast<void*>(memcpy)); - registry->Register("memmove", reinterpret_cast<void*>(memmove)); - registry->Register("memset", reinterpret_cast<void*>(memset)); + registry->Register("memcpy", reinterpret_cast<void*>(memcpy), "Host"); + registry->Register("memmove", reinterpret_cast<void*>(memmove), "Host"); + registry->Register("memset", reinterpret_cast<void*>(memset), "Host"); #ifdef __APPLE__ - registry->Register("__bzero", reinterpret_cast<void*>(bzero)); + registry->Register("__bzero", reinterpret_cast<void*>(bzero), "Host"); registry->Register("memset_pattern16", - reinterpret_cast<void*>(memset_pattern16)); + reinterpret_cast<void*>(memset_pattern16), "Host"); #endif #ifdef MEMORY_SANITIZER registry->Register("__msan_unpoison", - reinterpret_cast<void*>(__msan_unpoison)); + reinterpret_cast<void*>(__msan_unpoison), "Host"); #endif return true; diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc index 1bd4b59dd60..b15ad1e162d 100644 --- 
a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc @@ -107,13 +107,19 @@ llvm::Value* VectorSupportLibrary::Div(llvm::Value* lhs, llvm::Value* rhs) { llvm::Value* VectorSupportLibrary::Clamp(llvm::Value* a, const llvm::APFloat& low, const llvm::APFloat& high) { + CHECK(!low.isNaN()); + CHECK(!high.isNaN()); + CHECK(low.compare(high) == llvm::APFloat::cmpLessThan); + AssertCorrectTypes({a}); llvm::Type* type = a->getType(); - CHECK(low.compare(high) == llvm::APFloat::cmpLessThan); CHECK(scalar_type_->isFloatingPointTy()); - return llvm_ir::EmitFloatMin( - llvm_ir::EmitFloatMax(a, GetConstantFloat(type, low), b_), - GetConstantFloat(type, high), b_); + + llvm::Value* low_value = GetConstantFloat(type, low); + llvm::Value* high_value = GetConstantFloat(type, high); + a = b_->CreateSelect(b_->CreateFCmpUGE(a, low_value), a, low_value); + a = b_->CreateSelect(b_->CreateFCmpULE(a, high_value), a, high_value); + return a; } llvm::Value* VectorSupportLibrary::FCmpEQMask(llvm::Value* lhs, diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h index c444fd7d4aa..2f8be8c111b 100644 --- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h @@ -100,8 +100,10 @@ class VectorSupportLibrary { llvm::Value* Floor(llvm::Value* a); + // Precondition: Neither `low` nor `high` is nan. llvm::Value* Clamp(llvm::Value* a, const llvm::APFloat& low, const llvm::APFloat& high); + llvm::Value* SplatFloat(const llvm::APFloat& d) { return GetConstantFloat(vector_type(), d); } diff --git a/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.cc b/tensorflow/compiler/xla/service/custom_call_target_registry.cc similarity index 73% rename from tensorflow/compiler/xla/service/cpu/custom_call_target_registry.cc rename to tensorflow/compiler/xla/service/custom_call_target_registry.cc index 5f5803874b7..e6a70211f25 100644 --- a/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.cc +++ b/tensorflow/compiler/xla/service/custom_call_target_registry.cc @@ -13,10 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h" +#include "tensorflow/compiler/xla/service/custom_call_target_registry.h" namespace xla { -namespace cpu { CustomCallTargetRegistry* CustomCallTargetRegistry::Global() { static auto* registry = new CustomCallTargetRegistry; @@ -24,16 +23,17 @@ CustomCallTargetRegistry* CustomCallTargetRegistry::Global() { } void CustomCallTargetRegistry::Register(const std::string& symbol, - void* address) { + void* address, + const std::string& platform) { std::lock_guard<std::mutex> lock(mu_); - registered_symbols_[symbol] = address; + registered_symbols_[std::make_pair(symbol, platform)] = address; } -void* CustomCallTargetRegistry::Lookup(const std::string& symbol) const { +void* CustomCallTargetRegistry::Lookup(const std::string& symbol, + const std::string& platform) const { std::lock_guard<std::mutex> lock(mu_); - auto it = registered_symbols_.find(symbol); + auto it = registered_symbols_.find(std::make_pair(symbol, platform)); return it == registered_symbols_.end() ? 
nullptr : it->second; } -} // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/custom_call_target_registry.h b/tensorflow/compiler/xla/service/custom_call_target_registry.h new file mode 100644 index 00000000000..06239689e15 --- /dev/null +++ b/tensorflow/compiler/xla/service/custom_call_target_registry.h @@ -0,0 +1,92 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CUSTOM_CALL_TARGET_REGISTRY_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CUSTOM_CALL_TARGET_REGISTRY_H_ + +// This file is depended on by kernels that have to build for mobile devices. +// For this reason, we avoid relying on TensorFlow and instead only use the +// standard C++ library. + +#include <map> +#include <mutex> // NOLINT +#include <string> + +namespace xla { + +// XLA JIT compilers use this registry to resolve symbolic CustomCall targets; +// so when using XLA as a JIT, CustomCall targets need to be registered here +// with the symbol name used in the CustomCall. +// +// The XLA:CPU ahead-of-time (AOT) compiler links using a standard offline +// linker; so when compiling in CPU AOT mode, you *also* need to make sure the +// name of the callee (presumably implemented in C++) matches up with the +// symbolic name used in the CustomCall. +// +// We maintain the registry in both the JIT and the AOT cases for simplicity, +// but we only use it when running in JIT mode. +class CustomCallTargetRegistry { + public: + static CustomCallTargetRegistry* Global(); + + void Register(const std::string& symbol, void* address, + const std::string& platform); + void* Lookup(const std::string& symbol, const std::string& platform) const; + + private: + // Maps the pair (symbol, platform) to a C function implementing a custom call + // named `symbol` for StreamExecutor platform `platform`. + // + // Different platforms have different ABIs. TODO(jlebar): Describe them! + // + // (We std::map rather than std::unordered_map because the STL doesn't provide + // a default hasher for pair<string, string>, and we want to avoid pulling in + // dependencies that might define this.) 
+ std::map<std::pair<std::string, std::string>, void*> registered_symbols_; + mutable std::mutex mu_; +}; + +class RegisterCustomCallTarget { + public: + explicit RegisterCustomCallTarget(const std::string& name, void* address, + const std::string& platform) { + CustomCallTargetRegistry::Global()->Register(name, address, platform); + } +}; + +#define XLA_REGISTER_CUSTOM_CALL_CONCAT(a, b) a##b + +#define XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM_HELPER(symbol, address, \ + platform, counter) \ + static ::xla::RegisterCustomCallTarget XLA_REGISTER_CUSTOM_CALL_CONCAT( \ + custom_call_target_register, counter)( \ + symbol, reinterpret_cast<void*>(address), platform) + +#define XLA_REGISTER_CUSTOM_CALL_TARGET(function, platform) \ + XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(#function, function, platform) + +#define XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(symbol, address, platform) \ + XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM_HELPER(symbol, address, platform, \ + __COUNTER__) + +// Convenience overloads for registering custom-call targets on the CPU. +#define XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(function) \ + XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(#function, function, "Host") + +#define XLA_CPU_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(symbol, address) \ + XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(symbol, address, "Host") + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CUSTOM_CALL_TARGET_REGISTRY_H_ diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index 7ef843dc4e4..f45cda806c8 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -117,6 +117,7 @@ class DfsHloVisitorBase { virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0; virtual Status HandleCollectivePermute(HloInstructionPtr hlo) = 0; virtual Status HandleReplicaId(HloInstructionPtr hlo) = 0; + virtual Status HandlePartitionId(HloInstructionPtr hlo) = 0; virtual Status HandleGetDimensionSize(HloInstructionPtr hlo) = 0; virtual Status HandleCompare(HloInstructionPtr hlo) { return HandleElementwiseBinary(hlo); diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index 79ce3f82e8c..756ba9025f0 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -109,6 +109,9 @@ class DfsHloVisitorWithDefaultBase Status HandleReplicaId(HloInstructionPtr hlo) override { return DefaultAction(hlo); } + Status HandlePartitionId(HloInstructionPtr hlo) override { + return DefaultAction(hlo); + } Status HandleRng(HloInstructionPtr random) override { return DefaultAction(random); } diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index a08ec181d49..e71629526ed 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -24,13 +24,11 @@ limitations under the License. 
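A usage sketch of the platform-keyed registry and macros above (not part of the diff). The function names and bodies are hypothetical; "Host" and "CUDA" are the StreamExecutor platform names used by the CPU JIT and by the GPU custom-call test in this change, and the GPU-side signature is the one CustomCallThunk casts to further below. The (out, in) CPU signature is an assumption for illustration:

```c++
#include <cstddef>

#include "third_party/gpus/cuda/include/cuda.h"  // CUstream
#include "tensorflow/compiler/xla/service/custom_call_target_registry.h"

// CPU target, resolved by SimpleOrcJIT via Lookup(symbol, "Host").
extern "C" void MyHostFn(void* out, const void** in) {
  *static_cast<float*>(out) = *static_cast<const float*>(in[0]) * 2.0f;
}
XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(MyHostFn);
// ...equivalent to XLA_REGISTER_CUSTOM_CALL_TARGET(MyHostFn, "Host").

// GPU target, resolved by IrEmitterUnnested via Lookup(symbol, "CUDA") and
// invoked by CustomCallThunk as (stream, buffers, opaque, opaque_len).
void MyCudaFn(CUstream stream, void** buffers, const char* opaque,
              size_t opaque_len) {
  // Enqueue device work on `stream`; buffers[] holds device pointers.
}
XLA_REGISTER_CUSTOM_CALL_TARGET(MyCudaFn, "CUDA");

// Both registrations land in the same registry, keyed on (symbol, platform):
//   xla::CustomCallTargetRegistry::Global()->Lookup("MyHostFn", "Host");
//   xla::CustomCallTargetRegistry::Global()->Lookup("MyCudaFn", "CUDA");
```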
#include "absl/types/variant.h" #include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/computation_layout.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/compiler/xla/service/service_executable_run_options.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/shape_tree.h" @@ -40,6 +38,8 @@ limitations under the License. #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/owning_device_memory.h" namespace xla { @@ -47,13 +47,13 @@ namespace xla { // leftover buffers to be released by the caller. struct ExecutionOutput { ExecutionOutput(ScopedShapedBuffer result, - std::vector<OwningDeviceMemory> to_be_released) + std::vector<se::OwningDeviceMemory> to_be_released) : result(std::move(result)), to_be_released(std::move(to_be_released)) {} ScopedShapedBuffer result; // Leftover buffers for the caller to release. Elements in this list are // donated input memory buffers that are not reused by XLA as outputs. - std::vector<OwningDeviceMemory> to_be_released; + std::vector<se::OwningDeviceMemory> to_be_released; }; // A given platform's compiler will produce an Executable -- this is a uniform diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index f2bcc19ae1d..8305fe93d3d 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -85,6 +85,24 @@ cc_library( # ], #) +tf_cc_test( + name = "custom_call_test", + srcs = ["custom_call_test.cc"], + tags = ["requires-gpu-sm35"], + deps = [ + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client/lib:constants", + "//tensorflow/compiler/xla/service:custom_call_target_registry", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep + "//tensorflow/core:test", + "@local_config_cuda//cuda:cuda_headers", + ], +) + cc_library( name = "stream_assignment", srcs = ["stream_assignment.cc"], @@ -194,6 +212,7 @@ cc_library( "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:buffer_assignment", + "//tensorflow/compiler/xla/service:custom_call_target_registry", "//tensorflow/compiler/xla/service:elemental_ir_emitter", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_casting_utils", @@ -282,10 +301,10 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:buffer_assignment", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", + 
"//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:span", @@ -350,6 +369,7 @@ cc_library( "convolution_thunk.cc", "copy_thunk.cc", "cudnn_batchnorm_thunk.cc", + "custom_call_thunk.cc", "fft_thunk.cc", "for_thunk.cc", "gemm_thunk.cc", @@ -370,6 +390,7 @@ cc_library( "convolution_thunk.h", "copy_thunk.h", "cudnn_batchnorm_thunk.h", + "custom_call_thunk.h", "fft_thunk.h", "for_thunk.h", "gemm_thunk.h", @@ -408,7 +429,6 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:buffer_assignment", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_execution_profile", @@ -428,7 +448,10 @@ cc_library( "//tensorflow/stream_executor", "//tensorflow/stream_executor:blas", "//tensorflow/stream_executor:device_memory", + "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor:kernel", + "//tensorflow/stream_executor/cuda:cuda_stream", + "//tensorflow/stream_executor/gpu:gpu_stream", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", @@ -438,6 +461,7 @@ cc_library( "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", + "@local_config_cuda//cuda:cuda_headers", ], ) @@ -476,7 +500,6 @@ cc_library( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:compiler", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:hlo_pass", @@ -484,8 +507,8 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:logger", "//tensorflow/core:stream_executor_no_cuda", - "//tensorflow/core/kernels:conv_ops", "//tensorflow/core/util/proto:proto_utils", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -501,8 +524,8 @@ cc_library( deps = [ "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", ], ) @@ -518,12 +541,12 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory", + "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor:stream_executor_headers", ], ) @@ -537,12 +560,12 @@ tf_cc_test( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep 
"//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", "//tensorflow/core/platform/default/build_config:stream_executor_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:kernel", "//tensorflow/stream_executor/cuda:cuda_activation", @@ -635,12 +658,12 @@ cc_library( "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor:blas", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/types:optional", ], ) @@ -1165,7 +1188,6 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor:stream_executor_headers", diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc index f46a1bc51d9..3afc18d949a 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc @@ -39,7 +39,7 @@ void BufferAllocations::Builder::RegisterBuffer(BufferAllocation::Index index, StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build( const BufferAssignment* buffer_assignment, int device_ordinal, - DeviceMemoryAllocator* memory_allocator) { + se::DeviceMemoryAllocator* memory_allocator) { const int64 num_buffers = buffer_assignment->Allocations().size(); auto buffer_allocations = absl::WrapUnique(new BufferAllocations( num_buffers, device_ordinal, memory_allocator, buffer_assignment)); @@ -77,7 +77,7 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build( const int64 buffer_size = allocation.size(); se::DeviceMemoryBase buffer_address; if (buffer_size > 0) { - OwningDeviceMemory buffer; + se::OwningDeviceMemory buffer; TF_ASSIGN_OR_RETURN( buffer, memory_allocator->Allocate(device_ordinal, buffer_size)); if (reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment != diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h index 9413ac2cff7..cf78b92fe5b 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h @@ -23,9 +23,9 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace gpu { @@ -50,7 +50,7 @@ class BufferAllocations { // memory on. 
StatusOr<std::unique_ptr<BufferAllocations>> Build( const BufferAssignment* buffer_assignment, int device_ordinal, - DeviceMemoryAllocator* memory_allocator); + se::DeviceMemoryAllocator* memory_allocator); private: absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase> @@ -62,7 +62,9 @@ class BufferAllocations { BufferAllocations(const BufferAllocations&) = delete; BufferAllocations& operator=(const BufferAllocations&) = delete; - DeviceMemoryAllocator* memory_allocator() const { return memory_allocator_; } + se::DeviceMemoryAllocator* memory_allocator() const { + return memory_allocator_; + } int device_ordinal() const { return device_ordinal_; } // Returns the device address of buffer `buffer_index`. `buffer_index` must be @@ -84,7 +86,7 @@ class BufferAllocations { private: BufferAllocations(BufferAllocation::Index buffer_count, int device_ordinal, - DeviceMemoryAllocator* memory_allocator, + se::DeviceMemoryAllocator* memory_allocator, const BufferAssignment* buffer_assignment) : buffers_(buffer_count), device_ordinal_(device_ordinal), @@ -104,7 +106,7 @@ class BufferAllocations { se::DeviceMemoryBase temp_buffer_base_; int device_ordinal_; - DeviceMemoryAllocator* memory_allocator_; + se::DeviceMemoryAllocator* memory_allocator_; const BufferAssignment* buffer_assignment_; bool torn_down_ = false; }; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc index c110b338b65..b3f274e1130 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc @@ -256,9 +256,9 @@ StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( const auto device_ordinal = stream_exec_->device_ordinal(); // allocator either points to this->allocator_ or, if that's null, to a - // StreamExecutorMemoryAllocator for stream_exec_. - DeviceMemoryAllocator* allocator; - optional<StreamExecutorMemoryAllocator> se_allocator; + // se::StreamExecutorMemoryAllocator for stream_exec_. + se::DeviceMemoryAllocator* allocator; + optional<se::StreamExecutorMemoryAllocator> se_allocator; if (allocator_ != nullptr) { allocator = allocator_; } else { diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h index 664fd7ff1c6..9e8a797739a 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h @@ -19,13 +19,13 @@ limitations under the License. #include "absl/time/time.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/compiler.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/protobuf/autotuning.pb.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace gpu { @@ -38,7 +38,8 @@ class CudnnConvAlgorithmPicker : public HloModulePass { // memory while timing the various convolution algorithms. If it's null, // we'll use the default allocator on the StreamExecutor. 
CudnnConvAlgorithmPicker(se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* allocator, Compiler* compiler) + se::DeviceMemoryAllocator* allocator, + Compiler* compiler) : stream_exec_(stream_exec), allocator_(allocator), compiler_(compiler) {} absl::string_view name() const override { @@ -56,7 +57,7 @@ class CudnnConvAlgorithmPicker : public HloModulePass { const HloCustomCallInstruction* instr); se::StreamExecutor* stream_exec_; // never null - DeviceMemoryAllocator* allocator_; // may be null + se::DeviceMemoryAllocator* allocator_; // may be null Compiler* compiler_; }; diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_context.h b/tensorflow/compiler/xla/service/gpu/cusolver_context.h index fdd89c3a8d5..68b5fb14c6b 100644 --- a/tensorflow/compiler/xla/service/gpu/cusolver_context.h +++ b/tensorflow/compiler/xla/service/gpu/cusolver_context.h @@ -18,8 +18,8 @@ limitations under the License. #include <complex> -#include "cuda/include/cublas_v2.h" -#include "cuda/include/cusolverDn.h" +#include "third_party/gpus/cuda/include/cublas_v2.h" +#include "third_party/gpus/cuda/include/cusolverDn.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc index 7861eb1ef04..2ba6e8fc3c5 100644 --- a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc @@ -174,9 +174,9 @@ StatusOr<bool> CusolverRewriter::RunOnComputation(HloComputation* computation) { const auto device_ordinal = stream_exec_->device_ordinal(); // allocator either points to this->allocator_ or, if that's null, to a - // StreamExecutorMemoryAllocator for stream_exec_. - DeviceMemoryAllocator* allocator; - absl::optional<StreamExecutorMemoryAllocator> se_allocator; + // se::StreamExecutorMemoryAllocator for stream_exec_. + se::DeviceMemoryAllocator* allocator; + absl::optional<se::StreamExecutorMemoryAllocator> se_allocator; if (allocator_ != nullptr) { allocator = allocator_; } else { @@ -200,7 +200,7 @@ StatusOr<bool> CusolverRewriter::RunOnComputation(HloComputation* computation) { } CusolverRewriter::CusolverRewriter(se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* allocator) + se::DeviceMemoryAllocator* allocator) : stream_exec_(stream_exec), allocator_(allocator) {} StatusOr<bool> CusolverRewriter::Run(HloModule* module) { diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h index c82233188f7..d8c2cc55872 100644 --- a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h +++ b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h @@ -16,12 +16,12 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_REWRITER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_REWRITER_H_ -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/gpu/cusolver_context.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace gpu { @@ -30,7 +30,7 @@ namespace gpu { class CusolverRewriter : public HloModulePass { public: CusolverRewriter(se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* allocator); + se::DeviceMemoryAllocator* allocator); absl::string_view name() const override { return "cusolver-rewriter"; } StatusOr<bool> Run(HloModule* module) override; @@ -39,7 +39,7 @@ class CusolverRewriter : public HloModulePass { StatusOr<bool> RunOnComputation(HloComputation* computation); se::StreamExecutor* stream_exec_; // never null - DeviceMemoryAllocator* allocator_; // may be null + se::DeviceMemoryAllocator* allocator_; // may be null }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_test.cc b/tensorflow/compiler/xla/service/gpu/custom_call_test.cc new file mode 100644 index 00000000000..741d5ff09e0 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/custom_call_test.cc @@ -0,0 +1,189 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "third_party/gpus/cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/includes/cuda_headers/third_party/gpus/cuda/include/driver_types.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/service/custom_call_target_registry.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace { + +class CustomCallTest : public ClientLibraryTestBase {}; + +bool is_invoked_called = false; +void Callback_IsInvoked(CUstream /*stream*/, void** /*buffers*/, + const char* /*opaque*/, size_t /*opaque_len*/) { + is_invoked_called = true; +} +XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_IsInvoked, "CUDA"); + +TEST_F(CustomCallTest, IsInvoked) { + XlaBuilder b(TestName()); + CustomCall(&b, "Callback_IsInvoked", /*operands=*/{}, + ShapeUtil::MakeShape(F32, {}), + /*opaque=*/""); + EXPECT_FALSE(is_invoked_called); + TF_ASSERT_OK(Execute(&b, {}).status()); + EXPECT_TRUE(is_invoked_called); +} + +TEST_F(CustomCallTest, UnknownTarget) { + XlaBuilder b(TestName()); + CustomCall(&b, "UknownTarget", /*operands=*/{}, ShapeUtil::MakeShape(F32, {}), + /*opaque=*/""); + ASSERT_FALSE(Execute(&b, {}).ok()); +} + +void Callback_Memcpy(CUstream stream, void** buffers, const char* /*opaque*/, + size_t /*opaque_len*/) { + void* src = buffers[0]; + void* dst = buffers[1]; + auto err = cudaMemcpyAsync(dst, src, /*count=*/sizeof(float) * 128, + cudaMemcpyDeviceToDevice, stream); + CHECK_EQ(err, cudaSuccess); +} +XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_Memcpy, "CUDA"); +TEST_F(CustomCallTest, Memcpy) { + XlaBuilder b(TestName()); + CustomCall(&b, "Callback_Memcpy", + /*operands=*/{Broadcast(ConstantR0WithType(&b, F32, 42.0), {128})}, + ShapeUtil::MakeShape(F32, {128}), /*opaque=*/""); + TF_ASSERT_OK_AND_ASSIGN(auto result, ExecuteAndTransfer(&b, {})); + EXPECT_THAT(result.data<float>(), ::testing::Each(42)); +} + +// Check that opaque handles nulls within the string. +std::string& kExpectedOpaque = *new std::string("abc\0def", 7); +void Callback_Opaque(CUstream /*stream*/, void** /*buffers*/, + const char* opaque, size_t opaque_len) { + std::string opaque_str(opaque, opaque_len); + CHECK_EQ(opaque_str, kExpectedOpaque); +} +XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_Opaque, "CUDA"); +TEST_F(CustomCallTest, Opaque) { + XlaBuilder b(TestName()); + CustomCall(&b, "Callback_Opaque", /*operands=*/{}, + ShapeUtil::MakeShape(F32, {}), kExpectedOpaque); + TF_ASSERT_OK(Execute(&b, {}).status()); +} + +void Callback_SubBuffers(CUstream stream, void** buffers, + const char* /*opaque*/, size_t /*opaque_len*/) { + // `buffers` is a flat array containing device pointers to the following. 
+ // + // 0: root tuple of param 0 + // 1: param 0 at tuple index {0}, shape f32[128] + // 2: param 0 at tuple index {1}, shape f32[256] + // 3: root tuple of param 1 + // 4: param 1 at tuple index {0}, shape f32[1024] + // 5: param 1 at tuple index {1}, shape f32[8] + // 6: root tuple of custom-call result + // 7: result at tuple index {0}, shape f32[8] + // 8: result at tuple index {1}, shape (f32[128], f32[256]) + // 9: result at tuple index {1, 0}, shape f32[128] + // 10: result at tuple index {1, 1}, shape f32[256] + // 11: result at tuple index {2}, shape f32[1024] + // + // It's the contract of custom-call that the non-root pointers (i.e. + // everything other than indices 0, 3, and 6) may be null, if XLA is unable to + // analyze the program well enough to determine for sure what's in those + // buffers. For this simple example, all of the buffers should be non-null. + + // Check the param 0 tuple, namely that + // + // (*buffers[0])[0] == buffers[1] and + // (*buffers[0])[1] == buffers[2]. + // + // because buffers contains pointers to device memory, we have to retrieve + // these values via cudaMemcpy. + void* p0[2]; + cudaMemcpy(p0, buffers[0], 2 * sizeof(void*), cudaMemcpyDeviceToHost); + CHECK_EQ(p0[0], buffers[1]); + CHECK_EQ(p0[1], buffers[2]); + + // Check the param 1 tuple, namely that + // + // (*buffers[3])[0] == buffers[4] + // (*buffers[3])[1] == buffers[5]. + void* p1[2]; + cudaMemcpy(p1, buffers[3], 2 * sizeof(void*), cudaMemcpyDeviceToHost); + CHECK_EQ(p1[0], buffers[4]); + CHECK_EQ(p1[1], buffers[5]); + + // We don't have an equivalent check for the output tuple (i.e. we don't check + // (*buffers[6])[0] == buffers[7]) because it's up to us to set the tuple + // as part of this custom-call. + + // Write the results. First set the root tuple output buffer to {b7, b8, + // b11}. + void* root[3] = {buffers[7], buffers[8], buffers[11]}; + cudaMemcpy(buffers[6], root, 3 * sizeof(void*), cudaMemcpyHostToDevice); + + // Now set the sub-tuple output buffer at index 8 to {b9, b10}. + void* sub_tuple[2] = {buffers[9], buffers[10]}; + cudaMemcpy(buffers[8], sub_tuple, 2 * sizeof(void*), cudaMemcpyDeviceToHost); + + // Now set output leaf buffers 7, 9, 10, and 11, copying data from the + // corresponding same-sized inputs. 
+ cudaMemcpyAsync(buffers[7], buffers[5], 8 * sizeof(float), + cudaMemcpyDeviceToDevice, stream); + cudaMemcpyAsync(buffers[9], buffers[1], 128 * sizeof(float), + cudaMemcpyDeviceToDevice, stream); + cudaMemcpyAsync(buffers[10], buffers[2], 256 * sizeof(float), + cudaMemcpyDeviceToDevice, stream); + cudaMemcpyAsync(buffers[11], buffers[4], 1024 * sizeof(float), + cudaMemcpyDeviceToDevice, stream); +} +XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_SubBuffers, "CUDA"); +TEST_F(CustomCallTest, SubBuffers) { + XlaBuilder b(TestName()); + CustomCall(&b, "Callback_SubBuffers", /*operands=*/ + { + Tuple(&b, + { + Broadcast(ConstantR0WithType(&b, F32, 1), {128}), + Broadcast(ConstantR0WithType(&b, F32, 2), {256}), + }), + Tuple(&b, + { + Broadcast(ConstantR0WithType(&b, F32, 3), {1024}), + Broadcast(ConstantR0WithType(&b, F32, 4), {8}), + }), + }, + ShapeUtil::MakeTupleShape({ + ShapeUtil::MakeShape(F32, {8}), + ShapeUtil::MakeTupleShape({ + ShapeUtil::MakeShape(F32, {128}), + ShapeUtil::MakeShape(F32, {256}), + }), + ShapeUtil::MakeShape(F32, {1024}), + }), + /*opaque=*/""); + TF_ASSERT_OK_AND_ASSIGN(auto result, ExecuteAndTransfer(&b, {})); + EXPECT_THAT(result.data<float>({0}), ::testing::Each(4)); + EXPECT_THAT(result.data<float>({1, 0}), ::testing::Each(1)); + EXPECT_THAT(result.data<float>({1, 1}), ::testing::Each(2)); + EXPECT_THAT(result.data<float>({2}), ::testing::Each(3)); +} + +} // anonymous namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc new file mode 100644 index 00000000000..f0f3152ac98 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc @@ -0,0 +1,81 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/custom_call_thunk.h" + +#include "absl/strings/str_format.h" +#include "tensorflow/stream_executor/cuda/cuda_stream.h" +#include "tensorflow/stream_executor/gpu/gpu_stream.h" + +namespace xla { +namespace gpu { + +CustomCallThunk::CustomCallThunk( + void* call_target, + std::vector<ShapeTree<BufferAllocation::Slice>> operand_slices, + ShapeTree<BufferAllocation::Slice> result_slices, std::string opaque, + const HloInstruction* instr) + : Thunk(Thunk::kCustomCall, instr), + call_target_(call_target), + operand_slices_(std::move(operand_slices)), + result_slices_(std::move(result_slices)), + opaque_(std::move(opaque)) { + CHECK_EQ(instr->operand_count(), operand_slices_.size()); + for (int64 i = 0; i < instr->operand_count(); ++i) { + const auto& s1 = operand_slices_[i].shape(); + const auto& s2 = instr->operand(i)->shape(); + CHECK(ShapeUtil::Equal(s1, s2)) << absl::StreamFormat( + "Shape mismatch between instr->operand(%d) and " + "operand_slices[%d].shape(): %s vs %s", + i, i, s1.ToString(), s2.ToString()); + } + CHECK(ShapeUtil::Equal(instr->shape(), result_slices_.shape())) + << absl::StreamFormat( + "Shape mismatch between instr->shape() and result_slices.shape(): " + "%s vs %s.", + instr->shape().ToString(), result_slices_.shape().ToString()); +} + +Status CustomCallThunk::ExecuteOnStream( + const BufferAllocations& buffer_allocations, se::Stream* stream, + HloExecutionProfiler* profiler) { + // gpu_stream is CUstream or e.g. the equivalent type in ROCm. + auto gpu_stream = se::gpu::AsGpuStreamValue(stream); + auto typed_call_target = + reinterpret_cast<void (*)(decltype(gpu_stream), void** /*buffers*/, + const char* /*opaque*/, size_t /*opaque_len*/)>( + call_target_); + + std::vector<void*> buffers; + auto append_buffers = [&](const ShapeTree<BufferAllocation::Slice>& slices) { + slices.ForEachElement([&](const ShapeIndex& /*index*/, + const BufferAllocation::Slice& slice) { + if (slice.allocation() == nullptr) { + buffers.push_back(nullptr); + return; + } + buffers.push_back(buffer_allocations.GetDeviceAddress(slice).opaque()); + }); + }; + for (const auto& slices : operand_slices_) { + append_buffers(slices); + } + append_buffers(result_slices_); + + typed_call_target(gpu_stream, buffers.data(), opaque_.data(), opaque_.size()); + return Status::OK(); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h new file mode 100644 index 00000000000..9011fa26ffa --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h @@ -0,0 +1,61 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSTOM_CALL_THUNK_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSTOM_CALL_THUNK_H_ + +#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" +#include "tensorflow/compiler/xla/service/gpu/thunk.h" + +namespace xla { +namespace gpu { + +// Thunk to run a GPU custom call. +// +// This thunk's `ExecuteOnStream` implementation executes a host function +// `call_target` which is expected to enqueue operations onto the GPU. +// +// For information about the calling convention, see xla/g3doc/custom_call.md +// +// Note that not all kCustomCall HLOs in XLA:GPU end up being run by this thunk. +// XLA itself creates kCustomCall instructions when lowering kConvolution HLOs +// into calls to cudnn. These internally-created custom-calls are run using +// ConvolutionThunk, not CustomCallThunk. There's no ambiguity because they +// have special call target names (e.g. "__cudnn$convForward") that only the +// compiler is allowed to create. +class CustomCallThunk : public Thunk { + public: + CustomCallThunk( + void* call_target, + std::vector<ShapeTree<BufferAllocation::Slice>> operand_slices, + ShapeTree<BufferAllocation::Slice> result_slices, std::string opaque, + const HloInstruction* instr); + + Status ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream, + HloExecutionProfiler* profiler) override; + + private: + void* call_target_; + std::vector<ShapeTree<BufferAllocation::Slice>> operand_slices_; + ShapeTree<BufferAllocation::Slice> result_slices_; + std::string opaque_; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSTOM_CALL_THUNK_H_ diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index 25de339c1a3..ffa60da6f16 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -71,7 +71,6 @@ GpuElementalIrEmitter::GpuElementalIrEmitter( const HloModuleConfig& hlo_module_config, llvm::Module* module, llvm::IRBuilder<>* b, NestedComputer compute_nested) : ElementalIrEmitter(hlo_module_config, module, b), - hlo_module_config_(hlo_module_config), compute_nested_(std::move(compute_nested)) {} StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLibdeviceMathCall( diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index 2aedbf05abb..466543a2f92 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -126,7 +126,6 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { const string& callee_name, absl::Span<llvm::Value* const> operands, absl::Span<const PrimitiveType> input_types, PrimitiveType output_type); - const HloModuleConfig& hlo_module_config_; NestedComputer compute_nested_; }; diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc index ca4a605af5d..8f40010bdcb 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc @@ -29,7 +29,7 @@ namespace xla { namespace gpu { FftScratchAllocator::FftScratchAllocator( - int device_ordinal, DeviceMemoryAllocator* 
memory_allocator) + int device_ordinal, se::DeviceMemoryAllocator* memory_allocator) : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {} int64 FftScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) { @@ -48,7 +48,7 @@ StatusOr<se::DeviceMemory<uint8>> FftScratchAllocator::AllocateBytes( byte_size, GetMemoryLimitInBytes(stream))); } - TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer, + TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer, memory_allocator_->Allocate(device_ordinal_, byte_size, /*retry_on_failure=*/false)); total_allocated_bytes_ += byte_size; diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.h b/tensorflow/compiler/xla/service/gpu/fft_thunk.h index 2be50e08bd2..f653e4f12fe 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.h @@ -38,7 +38,7 @@ namespace gpu { class FftScratchAllocator : public se::ScratchAllocator { public: FftScratchAllocator(int device_ordinal, - DeviceMemoryAllocator* memory_allocator); + se::DeviceMemoryAllocator* memory_allocator); int64 GetMemoryLimitInBytes(se::Stream* stream) override; @@ -49,8 +49,8 @@ class FftScratchAllocator : public se::ScratchAllocator { private: const int device_ordinal_; - DeviceMemoryAllocator* memory_allocator_; - std::vector<OwningDeviceMemory> allocated_buffers_; + se::DeviceMemoryAllocator* memory_allocator_; + std::vector<se::OwningDeviceMemory> allocated_buffers_; int64 total_allocated_bytes_ = 0; }; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index f65ff9b2cd8..dec40c5e49c 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -230,7 +230,7 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::Execute( const ServiceExecutableRunOptions* run_options, absl::Span<const ShapedBuffer* const> arguments, HloExecutionProfile* hlo_execution_profile, bool block_host_until_done) { - DeviceMemoryAllocator* memory_allocator = run_options->allocator(); + se::DeviceMemoryAllocator* memory_allocator = run_options->allocator(); if (GetRootPointsToSet().IsAmbiguous()) { return Unimplemented("Points-to set of root instruction is ambiguous"); @@ -348,7 +348,7 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream( StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, absl::Span<const ShapedBuffer* const> arguments) { - DeviceMemoryAllocator* memory_allocator = run_options->allocator(); + se::DeviceMemoryAllocator* memory_allocator = run_options->allocator(); // Force synchronous execution if the allocator requires it. bool block_host_until_done = !memory_allocator->AllowsAsynchronousDeallocation(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 8e71647a0da..b1f63bc672e 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -24,7 +24,6 @@ limitations under the License. 
#include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" @@ -38,6 +37,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h index a78b4ff8307..b9d944b5dc1 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h @@ -32,10 +32,12 @@ class IrEmitterContext { public: IrEmitterContext(const HloModule* hlo_module, const BufferAssignment* buffer_assignment, + const se::Platform* platform, const se::DeviceDescription* device_desc, llvm::Module* llvm_module) : hlo_module_(hlo_module), buffer_assignment_(buffer_assignment), + platform_(platform), device_desc_(device_desc), llvm_module_(llvm_module) {} // Disallow copy and assign. @@ -47,6 +49,7 @@ class IrEmitterContext { const BufferAssignment& buffer_assignment() const { return *buffer_assignment_; } + const se::Platform* platform() const { return platform_; } const se::DeviceDescription& device_description() const { return *device_desc_; } @@ -56,6 +59,7 @@ class IrEmitterContext { private: const HloModule* hlo_module_; const BufferAssignment* buffer_assignment_; + const se::Platform* platform_; const se::DeviceDescription* device_desc_; llvm::Module* llvm_module_; NameUniquer name_uniquer_; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 6793aa88f2a..774c2b8682f 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h" + #include <algorithm> #include <cstring> #include <iterator> @@ -20,8 +22,6 @@ limitations under the License. #include <string> #include <vector> -#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h" - #include "absl/algorithm/container.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" @@ -37,6 +37,7 @@ limitations under the License. #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/cholesky_thunk.h" @@ -45,6 +46,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/copy_thunk.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h" +#include "tensorflow/compiler/xla/service/gpu/custom_call_thunk.h" #include "tensorflow/compiler/xla/service/gpu/fft_thunk.h" #include "tensorflow/compiler/xla/service/gpu/for_thunk.h" #include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h" @@ -527,7 +529,35 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { return Status::OK(); } - return IrEmitter::HandleCustomCall(custom_call); + if (void* call_target = CustomCallTargetRegistry::Global()->Lookup( + custom_call->custom_call_target(), + ir_emitter_context_->platform()->Name())) { + const auto& assn = ir_emitter_context_->buffer_assignment(); + auto get_slices_for_instr = [&](const HloInstruction* instr) { + ShapeTree<BufferAllocation::Slice> slices(instr->shape()); + slices.ForEachMutableElement([&](const ShapeIndex& index, + BufferAllocation::Slice* slice) { + StatusOr<BufferAllocation::Slice> s = assn.GetUniqueSlice(instr, index); + if (s.ok()) { + *slice = s.ValueOrDie(); + } + }); + return slices; + }; + std::vector<ShapeTree<BufferAllocation::Slice>> operand_slices; + for (const auto* operand : custom_call->operands()) { + operand_slices.push_back(get_slices_for_instr(operand)); + } + ShapeTree<BufferAllocation::Slice> result_slices = + get_slices_for_instr(custom_call); + AddThunkToThunkSequence(absl::make_unique<CustomCallThunk>( + call_target, std::move(operand_slices), std::move(result_slices), + Cast<HloCustomCallInstruction>(custom_call)->opaque(), custom_call)); + return Status::OK(); + } + + return Unimplemented("No registered implementation for custom call to \"%s\"", + custom_call->custom_call_target()); } Status IrEmitterUnnested::HandleFft(HloInstruction* fft) { diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index d977e0a1629..d028557f5b8 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -164,7 +164,7 @@ string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) { // It takes a compiler pointer, as passes may compile and execute HLOs on the // fly for cuDNN verification or other purposes. Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator, + se::DeviceMemoryAllocator* device_allocator, Compiler* compiler) { { HloPassPipeline pipeline("optimization"); @@ -463,7 +463,7 @@ NVPTXCompiler::NVPTXCompiler() StatusOr<std::unique_ptr<HloModule>> NVPTXCompiler::RunHloPasses( std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { // We dump the post-optimization HLO in RunBackend so no need to dump it here. 
XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses"); tensorflow::profiler::TraceMe activity( @@ -479,7 +479,7 @@ StatusOr<std::unique_ptr<HloModule>> NVPTXCompiler::RunHloPasses( StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend( std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend"); TF_RET_CHECK(stream_exec != nullptr); @@ -521,9 +521,9 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend( /*allocate_buffers_for_constants=*/true)); DumpHloModuleIfEnabled(*module, *buffer_assignment, "after_optimizations"); - IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(), - &stream_exec->GetDeviceDescription(), - &llvm_module); + IrEmitterContext ir_emitter_context( + module.get(), buffer_assignment.get(), stream_exec->platform(), + &stream_exec->GetDeviceDescription(), &llvm_module); HloComputation* entry_computation = module->entry_computation(); IrEmitterUnnested ir_emitter(module->config(), entry_computation, diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h index b74e5f01c6e..25e4b9427c0 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h @@ -53,11 +53,11 @@ class NVPTXCompiler : public LLVMCompiler { StatusOr<std::unique_ptr<HloModule>> RunHloPasses( std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr<std::unique_ptr<Executable>> RunBackend( std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>> CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group, diff --git a/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc b/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc index c5f812b68cd..b2229971e9f 100644 --- a/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc +++ b/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc @@ -50,7 +50,7 @@ StatusOr<se::DeviceMemory<uint8>> RedzoneAllocator::AllocateBytes( int64 rhs_slop = RoundUpToNearest(byte_size, kRhsRedzoneAlign) - byte_size; TF_ASSIGN_OR_RETURN( - OwningDeviceMemory allocated_buffer, + se::OwningDeviceMemory allocated_buffer, memory_allocator_->Allocate(device_ordinal_, byte_size + 2 * redzone_size_ + rhs_slop, /*retry_on_failure=*/false)); diff --git a/tensorflow/compiler/xla/service/gpu/redzone_allocator.h b/tensorflow/compiler/xla/service/gpu/redzone_allocator.h index f92167bcc42..4e3438c6dfc 100644 --- a/tensorflow/compiler/xla/service/gpu/redzone_allocator.h +++ b/tensorflow/compiler/xla/service/gpu/redzone_allocator.h @@ -18,12 +18,12 @@ limitations under the License. 
#include <vector> -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/owning_device_memory.h" namespace xla { namespace gpu { @@ -41,7 +41,8 @@ namespace gpu { // memory for cudnn convolutions. class RedzoneAllocator : public se::ScratchAllocator { public: - RedzoneAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator, + RedzoneAllocator(int device_ordinal, + se::DeviceMemoryAllocator* memory_allocator, const HloModuleConfig& hlo_module_config, int64 redzone_size = 1 << 23, // 8MiB per side, 16MiB total uint8 redzone_pattern = -1) @@ -76,14 +77,14 @@ class RedzoneAllocator : public se::ScratchAllocator { const int64 redzone_size_; const uint8 redzone_pattern_; - DeviceMemoryAllocator* memory_allocator_; + se::DeviceMemoryAllocator* memory_allocator_; const HloModuleConfig& hlo_module_config_; // The second element of the pair is the size of the user allocation. This // isn't necessarily just first.size() - 2 * redzone_size_ because when the // user allocation size is not a multiple of 4 bytes, we round up the size of // the RHS redzone. - std::vector<std::pair<OwningDeviceMemory, int64>> allocated_buffers_; + std::vector<std::pair<se::OwningDeviceMemory, int64>> allocated_buffers_; int64 allocated_bytes_excluding_redzones_ = 0; }; diff --git a/tensorflow/compiler/xla/service/gpu/redzone_allocator_test.cc b/tensorflow/compiler/xla/service/gpu/redzone_allocator_test.cc index a36aaa3f216..a3b0ac3ecae 100644 --- a/tensorflow/compiler/xla/service/gpu/redzone_allocator_test.cc +++ b/tensorflow/compiler/xla/service/gpu/redzone_allocator_test.cc @@ -15,13 +15,13 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/redzone_allocator.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" #include "tensorflow/stream_executor/multi_platform_manager.h" #include "tensorflow/stream_executor/platform.h" @@ -42,7 +42,7 @@ TEST(RedzoneAllocatorTest, WriteToRedzone) { se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); se::StreamExecutor* stream_exec = platform->ExecutorForDevice(0).ValueOrDie(); HloModuleConfig config; - StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec}); + se::StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec}); RedzoneAllocator allocator(/*device_ordinal=*/0, &se_allocator, config, kRedzoneSize, kRedzonePattern); @@ -118,7 +118,7 @@ TEST(RedzoneAllocatorTest, VeryLargeRedzone) { se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); se::StreamExecutor* stream_exec = platform->ExecutorForDevice(0).ValueOrDie(); HloModuleConfig config; - StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec}); + se::StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec}); RedzoneAllocator allocator(/*device_ordinal=*/0, &se_allocator, config, kRedzoneSize, /*redzone_pattern=*/-1); se::Stream stream(stream_exec); diff --git a/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc b/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc index 197367e8168..7a3220483a8 100644 --- a/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc +++ b/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc @@ -29,7 +29,7 @@ StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes( byte_size, GetMemoryLimitInBytes(stream))); } - TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer, + TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer, memory_allocator_->Allocate(device_ordinal_, byte_size, /*retry_on_failure=*/false)); total_allocated_bytes_ += byte_size; diff --git a/tensorflow/compiler/xla/service/gpu/scratch_allocator.h b/tensorflow/compiler/xla/service/gpu/scratch_allocator.h index 620c7e78912..a22e7f5ea24 100644 --- a/tensorflow/compiler/xla/service/gpu/scratch_allocator.h +++ b/tensorflow/compiler/xla/service/gpu/scratch_allocator.h @@ -18,18 +18,19 @@ limitations under the License. 
#include <vector> -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/owning_device_memory.h" namespace xla { namespace gpu { class ScratchAllocator : public se::ScratchAllocator { public: - ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator) + ScratchAllocator(int device_ordinal, + se::DeviceMemoryAllocator* memory_allocator) : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {} int64 GetMemoryLimitInBytes(se::Stream* stream) override { @@ -50,8 +51,8 @@ class ScratchAllocator : public se::ScratchAllocator { private: const int device_ordinal_; - DeviceMemoryAllocator* memory_allocator_; - std::vector<OwningDeviceMemory> allocated_buffers_; + se::DeviceMemoryAllocator* memory_allocator_; + std::vector<se::OwningDeviceMemory> allocated_buffers_; int64 total_allocated_bytes_ = 0; }; diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index d798b316437..b6ce15bb384 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -200,8 +200,8 @@ tf_cc_test( tags = tf_cuda_tests_tags(), deps = [ ":gpu_codegen_test", + "//tensorflow/compiler/xla/service:custom_call_target_registry", "//tensorflow/compiler/xla/service:gpu_plugin", - "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry", "//tensorflow/compiler/xla/service/llvm_ir:alias_analysis", "//tensorflow/compiler/xla/tests:filecheck", "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc index 672c68e59b5..914b81c632f 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc @@ -16,7 +16,7 @@ limitations under the License. 
#include <memory> #include <utility> -#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h" +#include "tensorflow/compiler/xla/service/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" #include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h" #include "tensorflow/compiler/xla/tests/filecheck.h" diff --git a/tensorflow/compiler/xla/service/gpu/thunk.cc b/tensorflow/compiler/xla/service/gpu/thunk.cc index f43e05904dd..5a9b7bdf902 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/thunk.cc @@ -34,6 +34,8 @@ absl::string_view ThunkKindToString(Thunk::Kind kind) { return "kCudnnBatchNormForwardInference"; case Thunk::kCudnnBatchNormForwardTraining: return "kCudnnBatchNormForwardTraining"; + case Thunk::kCustomCall: + return "kCustomCall"; case Thunk::kNcclAllReduce: return "kNcclAllReduce"; case Thunk::kFft: diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index 6a35f225f71..bdd06718717 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -49,13 +49,14 @@ class Thunk { kCudnnBatchNormBackward, kCudnnBatchNormForwardInference, kCudnnBatchNormForwardTraining, - kNcclAllReduce, + kCustomCall, kFft, kGemm, kInfeed, kKernel, kMemset32BitValue, kMemzero, + kNcclAllReduce, kOutfeed, kSequential, kTriangularSolve, diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index e42808be773..89dbe93b36b 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -198,6 +198,13 @@ class HloComputation { const HloComputationProto& proto, const absl::flat_hash_map<int64, HloComputation*>& computation_map); + using InstructionSequence = tensorflow::gtl::iterator_range< + UnwrappingIterator<std::list<std::unique_ptr<HloInstruction>>::iterator>>; + + using ConstInstructionSequence = + tensorflow::gtl::iterator_range<UnwrappingIterator< + std::list<std::unique_ptr<HloInstruction>>::const_iterator>>; + // Gets the instructions in this computation. // // The returned type is a range of HloInstruction*s, so you can iterate over @@ -205,15 +212,11 @@ class HloComputation { // // for (HloInstruction* instr : computation->instructions()) { ... 
} // - tensorflow::gtl::iterator_range<UnwrappingIterator< - std::list<std::unique_ptr<HloInstruction>>::const_iterator>> - instructions() const { + ConstInstructionSequence instructions() const { return {MakeUnwrappingIterator(instructions_.begin()), MakeUnwrappingIterator(instructions_.end())}; } - tensorflow::gtl::iterator_range< - UnwrappingIterator<std::list<std::unique_ptr<HloInstruction>>::iterator>> - instructions() { + InstructionSequence instructions() { return {MakeUnwrappingIterator(instructions_.begin()), MakeUnwrappingIterator(instructions_.end())}; } diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index fbaab1d7d98..8c1b22e0a10 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -638,6 +638,10 @@ Status HloCostAnalysis::HandleCollectivePermute(const HloInstruction* /*hlo*/) { return Status::OK(); } +Status HloCostAnalysis::HandlePartitionId(const HloInstruction* /*hlo*/) { + return Status::OK(); +} + Status HloCostAnalysis::HandleReplicaId(const HloInstruction* /*hlo*/) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index ab96fa4796f..b76465531f0 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -77,6 +77,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleAllToAll(const HloInstruction* hlo) override; Status HandleCollectivePermute(const HloInstruction* hlo) override; Status HandleReplicaId(const HloInstruction* hlo) override; + Status HandlePartitionId(const HloInstruction* hlo) override; Status HandleInfeed(const HloInstruction* infeed) override; Status HandleOutfeed(const HloInstruction* outfeed) override; Status HandleRng(const HloInstruction* random) override; diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc index 849cac278ee..1e7e125d956 100644 --- a/tensorflow/compiler/xla/service/hlo_cse.cc +++ b/tensorflow/compiler/xla/service/hlo_cse.cc @@ -143,7 +143,9 @@ StatusOr<bool> HloCSE::Run(HloModule* module) { for (auto instruction : computation->MakeInstructionPostOrder()) { // If the instruction has zero operands (constants, parameters, etc.) skip // over it. - if (instruction->operand_count() == 0) { + if (instruction->operand_count() == 0 && + instruction->opcode() != HloOpcode::kPartitionId && + instruction->opcode() != HloOpcode::kReplicaId) { continue; } // Skip instructions which have side effects. diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index da58e9608b1..0320979102f 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -27,6 +27,7 @@ limitations under the License. #include "absl/container/inlined_vector.h" #include "absl/memory/memory.h" #include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/index_util.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" @@ -779,6 +780,545 @@ Status HloEvaluator::HandleTuple(HloInstruction* tuple) { return Status::OK(); } +namespace { + +// Straightforward implementation of 1D DFT transform. 
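+// (For the forward transform this computes X[k] = sum over n of x[n] * +// exp(-2*pi*i*n*k / length) for k in [0, length); the inverse conjugates the +// inputs and the result and divides by length.)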
Uses passed-in start +// index and stride to gather inputs from the data vector into the preallocated +// buffer, computes the result, and writes it back to the same locations in the +// data vector. Runs in O(length^2) time. +// +// Parameters contract_output and expand_input are used to avoid unnecessary +// calculations. When contract_output is set to true, then only (length / 2) + 1 +// output values are computed. When expand_input is set to true, then +// (length / 2) + 1 values from the data set are used to re-create the full set +// of size 'length', on which the transform is then performed. +// +void NaiveDft1D(int64 length, int64 start, int64 stride, bool inverse, + bool contract_output, bool expand_input, + absl::Span<complex128> data, absl::Span<complex128> buffer) { + CHECK_GT(data.size(), start + (length - 1) * stride); + CHECK_GT(buffer.size(), length - 1); + + // Copy input data to 1D vector. + bool input_is_zero = true; + const int64 ub = expand_input ? length / 2 + 1 : length; + for (int64 k = 0; k < ub; k++) { + complex128 value = data[start + k * stride]; + input_is_zero &= value == complex128(0.0, 0.0); + buffer[k] = value; + if (expand_input) { + // Use conjugates of the values at indices [1 ... (ub - 2)] when the + // length is even and at indices [1 ... (ub - 1)] when the length is odd + // to calculate missing values at indices [(length - 1) ... ub]. + if (k > 0 && k < (length - ub + 1)) { + buffer[length - k] = std::conj(value); + } + } + } + + // Do 1D transformation with double precision. + if (!input_is_zero) { + const int64 ub = contract_output ? length / 2 + 1 : length; + for (int64 k = 0; k < ub; k++) { + complex128 value = complex128(0.0, 0.0); + for (int n = 0; n < length; n++) { + auto coeff = std::exp(complex128(0.0, -2.0 * M_PI * n * k / length)); + value += (inverse ? std::conj(buffer[n]) : buffer[n]) * coeff; + } + data[start + k * stride] = + inverse ? std::conj(value) / complex128(length, 0.0) : value; + } + } +} + +// Helper to reverse the order of dimension lengths in the passed-in literal. +std::vector<int64> GetDimensionLengths(const Literal& literal) { + std::vector<int64> lengths = literal.shape().dimensions(); + absl::c_reverse(lengths); + return lengths; +} + +// Helper to compute strides for creating linear indices into multidimensional +// data from the dimension lengths and the layout. Returns a new vector of size +// lengths.size() + 1. The last element of the returned vector at index +// [lengths.size()] contains the product of all dimension lengths. +std::vector<int64> ComputeStrides(const absl::Span<const int64> lengths, + const Layout& layout) { + const int64 num_dimensions = lengths.size(); + + // Make sure that the layout length matches the number of dimensions. + CHECK_EQ(num_dimensions, layout.minor_to_major_size()); + + // Calculate strides using layout-specified ordering of the dimensions and + // place the stride for axis 0 at index 0, for axis 1 at index 1, etc. + std::vector<int64> strides(num_dimensions + 1); + int64 stride = 1; + for (int64 i = 0; i < num_dimensions; i++) { + // Reverse the ordering of the dimensions in the layout. + const int64 index = (num_dimensions - 1) - layout.minor_to_major(i); + strides[index] = stride; + stride *= lengths[index]; + } + strides[num_dimensions] = stride; + + return strides; +} + +// Compute strides as above using the default layout. 
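+// For example, lengths {8, 4, 2} (minor-most axis first) produce strides +// {1, 8, 32}, and the extra element strides[3] == 64 holds the total number of +// elements.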
+std::vector<int64> ComputeStrides(const absl::Span<const int64> lengths) { + return ComputeStrides(lengths, + LayoutUtil::GetDefaultLayoutForRank(lengths.size())); +} + +// Compute strides as above using the layout from the literal, if available. +std::vector<int64> ComputeStrides(const absl::Span<const int64> lengths, + const Literal& literal) { + return literal.shape().has_layout() + ? ComputeStrides(lengths, literal.shape().layout()) + : ComputeStrides(lengths); +} + +// Make 1D sweeps along each transform axis. +void Sweep(int64 fft_rank, FftType fft_type, + const absl::Span<const int64> fft_lengths, + const absl::Span<const int64> fft_strides, + absl::Span<complex128> data, absl::Span<complex128> buffer) { + const bool inverse = fft_type == FftType::IFFT || fft_type == FftType::IRFFT; + const bool input_is_truncated = fft_type == FftType::IRFFT; + const bool output_is_truncated = fft_type == FftType::RFFT; + + // Recursively visit each column of the data along the sweep_axis. Calculate + // the linearized index of that column's first element and the stride, then + // invoke the 1D transform. + // For RFFT, avoid calculating unused output values: first, compute only + // (length_x / 2) + 1 values along the X axis, then limit the X coordinate to + // [0 ... (length / 2)] during the sweeps along other axes. Similarly, for + // IRFFT sweep along higher dimensions first, while keeping the X coordinate + // in the [0 ... (length / 2)] range, then re-create negative frequencies + // omitted in the input and perform the full-length transform along the X axis + // in the last sweep. + std::function<void(int64, int64, int64)> sweep = [&](int64 sweep_axis, + int64 axis, + int64 start) { + if (axis < 0) { + // Base case: invoke the 1D transform. + const int64 length = fft_lengths[sweep_axis]; + const int64 stride = fft_strides[sweep_axis]; + const bool expand_input = input_is_truncated && sweep_axis == 0; + const bool contract_output = output_is_truncated && sweep_axis == 0; + NaiveDft1D(length, start, stride, inverse, contract_output, expand_input, + data, buffer); + } else if (axis == sweep_axis) { + // Visit only the elements with coordinate 0 along the sweep axis. + sweep(sweep_axis, axis - 1, start); + } else { + const int64 length = fft_lengths[axis]; + const bool is_truncated = input_is_truncated || output_is_truncated; + const int64 ub = is_truncated && axis == 0 ? (length / 2) + 1 : length; + for (int64 i = 0; i < ub; i++) { + sweep(sweep_axis, axis - 1, start + i * fft_strides[axis]); + } + } + }; + if (input_is_truncated) { + // Sweep along the X axis last for IRFFT. + for (int64 sweep_axis = fft_rank - 1; sweep_axis >= 0; sweep_axis--) { + sweep(sweep_axis, fft_rank - 1, 0); + } + } else { + // Sweep along the X axis first for RFFT. The order does not matter for FFT + // and IFFT types; handle them here as well. + for (int64 sweep_axis = 0; sweep_axis < fft_rank; sweep_axis++) { + sweep(sweep_axis, fft_rank - 1, 0); + } + } +} + +// These templates convert the data from the input data type to the type used in +// calculations and then to the output data type. They are intended to be used +// only within the DFT implementation. One special case is IRFFT, where the +// specialization drops imaginary parts of complex values (which are expected to +// be 0) and returns real numbers.
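+// For illustration: GetAs<complex128>(2.5f) yields complex128(2.5, 0.0), while +// the IRFFT-oriented specialization below maps complex128(3.0, 0.0) to the +// float 3.0, dropping the imaginary part.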
+template <typename ToType, typename FromType> +ToType GetAs(FromType value) { + return static_cast<ToType>(value); +} + +template <> +float GetAs<float, complex128>(complex128 value) { + return static_cast<float>(value.real()); +} + +// This template generates two linearized indices, which can be used to access +// multidimensional arrays. It uses a recursive function, which passes the +// indices to the user-supplied callback function. The destination index is +// always within dst_lengths[] bounds. The boolean parameter within_src_bounds +// indicates whether the source index is within src_lengths[] bounds. +// +// The value returned from the callback function controls the recursion depth. +// Returning true indicates that the base case had been hit and the recursion +// stops. Otherwise, the recursion proceeds along the next less-major axis. +// +// For example, the base case when the axis value becomes negative invokes the +// callback function for each possible index within dst_lengths[] bounds. The +// base case when the axis value is equal to zero limits the indices to point +// only to first elements along the minor-most dimension, allowing the callback +// function to handle all values along the X axis. +// +template <typename BaseFn> +void GenerateIndices(const absl::Span<const int64> dst_lengths, + const absl::Span<const int64> dst_strides, + const absl::Span<const int64> src_lengths, + const absl::Span<const int64> src_strides, int64 fft_rank, + int64 dst_start, int64 src_start, BaseFn&& base) { + CHECK_EQ(dst_lengths.size() + 1, dst_strides.size()); + CHECK_GE(dst_lengths.size(), fft_rank); + CHECK_EQ(src_lengths.size() + 1, src_strides.size()); + CHECK_GE(src_lengths.size(), fft_rank); + + std::function<void(int64, int64, int64, bool)> generate = + [&](int64 axis, int64 dst_index, int64 src_index, + bool within_src_bounds) { + if (!base(axis, dst_index, src_index, within_src_bounds)) { + for (int64 i = 0; i < dst_lengths[axis]; i++) { + // Because the loop goes over dst_lengths[], the source index may be + // out of src_lengths[] bounds. In this case, within_src_bounds is + // false. + within_src_bounds &= i < src_lengths[axis]; + generate(axis - 1, dst_index, src_index, within_src_bounds); + dst_index += dst_strides[axis]; + src_index += src_strides[axis]; + } + } + }; + generate(fft_rank - 1, dst_start, src_start, true); +} + +// Copies the input data from a literal to a pre-allocated vector. The sizes of +// the input and the transform do not need to match. For each axis of the +// transform, any extra input values beyond the transform length are ignored. +// Conversely, if the input does not contain enough elements along any axis, the +// data is padded with zeroes. +// +// For IRFFT transforms, we use (length_x / 2) + 1 elements from the input, +// where length_x is the size of the full transform along the X axis. +// +// The input literal may have a rank higher than the rank of the transform. +// Passed-in input_index value points to the first element of the input literal +// to be copied. +// +// Returns true if all values in the work data set are zeroes. 
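+// For example, a 1D RFFT of length 8 over an f32[4] input {1, 2, 3, 0} fills +// the working row as {1, 2, 3, 0, 0, 0, 0, 0} and returns false; an all-zero +// input returns true.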
+// +template <typename InputType> +bool CopyDataFromInput(const Literal& input_literal, int64 input_start, + int64 fft_rank, FftType fft_type, int64 fft_size, + const absl::Span<const int64> fft_lengths, + const absl::Span<const int64> fft_strides, + const absl::Span<const int64> input_lengths, + const absl::Span<const int64> input_strides, + absl::Span<complex128> data) { + CHECK_GE(data.size(), fft_size); + + const bool input_is_truncated = fft_type == FftType::IRFFT; + + // Recursively visit each transform dimension to copy input values to the + // working data set. The base case handles inputs along the X axis. + bool input_is_zero = true; + const InputType* input_data = input_literal.data<InputType>().data(); + auto base_case = [&](int64 axis, int64 dst_index, int64 src_index, + bool within_src_bounds) { + if (axis == 0) { + // For IRFFT, the negative frequencies are only needed for the sweep along + // the X axis, which is performed last. Leave this part of the working set + // uninitialized until then. + const int64 length = fft_lengths[axis]; + const int64 ub = input_is_truncated ? (length / 2) + 1 : length; + for (int64 i = 0; i < ub; i++) { + complex128 value = InputType(0); + // Read input value only if the index is within bounds. + if (within_src_bounds && i < input_lengths[axis]) { + value = GetAs<complex128, InputType>( + input_data[src_index + i * input_strides[axis]]); + input_is_zero &= value == complex128(0.0, 0.0); + } + data[dst_index + i * fft_strides[axis]] = value; + } + return true; + } + return false; + }; + GenerateIndices(fft_lengths, fft_strides, input_lengths, input_strides, + fft_rank, 0, input_start, base_case); + return input_is_zero; +} + +// Copies the result of the transform to the literal output. The sizes of the +// transform and output must match. +// +// For RFFT transforms, we copy (length_x / 2) + 1 elements, where length_x is +// the size of the full transform along the X axis (the minor-most dimension). +// +// The output literal may have a rank higher than the rank of the transform. +// The passed-in output_index value points to the first element of the output +// literal to be filled in. +// +template <typename OutputType> +void CopyDataToOutput(const absl::Span<complex128> data, int64 output_start, + int64 fft_rank, FftType fft_type, + const absl::Span<const int64> fft_lengths, + const absl::Span<const int64> fft_strides, + const absl::Span<const int64> output_lengths, + const absl::Span<const int64> output_strides, + Literal* output_literal) { + const bool output_is_truncated = fft_type == FftType::RFFT; + + // Base case for recursive copy of the results to the output. The code avoids + // making a recursive call for each output element by handling axis 0 in the + // loop (as opposed to making "axis < 0" the base case). + OutputType* output_data = output_literal->data<OutputType>().data(); + auto base_case = [&](int64 axis, int64 dst_index, int64 src_index, + bool within_src_bounds) { + if (axis == 0) { + // Drop negative frequencies for RFFT. + const int64 length = fft_lengths[axis]; + const int64 ub = output_is_truncated ? (length / 2) + 1 : length; + for (int64 i = 0; i < output_lengths[axis]; i++) { + OutputType value = OutputType(0); + // Read data only if the index is within bounds.
+ if (within_src_bounds && i < ub) { + value = GetAs<OutputType, complex128>( + data[src_index + i * fft_strides[axis]]); + } + output_data[dst_index + i * output_strides[axis]] = value; + } + return true; + } + return false; + }; + GenerateIndices(output_lengths, output_strides, fft_lengths, fft_strides, + fft_rank, output_start, 0, base_case); +} + +// Determine the type to use with the CopyDataFromInput<> template above. +bool CopyDataFromInput(const Literal& input_literal, int64 input_start, + int64 fft_rank, FftType fft_type, int64 fft_size, + const absl::Span<const int64> fft_lengths, + const absl::Span<const int64> fft_strides, + const absl::Span<const int64> input_lengths, + const absl::Span<const int64> input_strides, + absl::Span<complex128> data) { + const bool input_is_float = fft_type == FftType::RFFT; + if (input_is_float) { + return CopyDataFromInput<float>( + input_literal, input_start, fft_rank, fft_type, fft_size, fft_lengths, + fft_strides, input_lengths, input_strides, data); + } else { + return CopyDataFromInput<complex64>( + input_literal, input_start, fft_rank, fft_type, fft_size, fft_lengths, + fft_strides, input_lengths, input_strides, data); + } +} + +// Determine the type to use with the CopyDataToOutput<> template above. +void CopyDataToOutput(const absl::Span<complex128> data, int64 output_start, + int64 fft_rank, FftType fft_type, + const absl::Span<const int64> fft_lengths, + const absl::Span<const int64> fft_strides, + const absl::Span<const int64> output_lengths, + const absl::Span<const int64> output_strides, + Literal* output_literal) { + const bool output_is_float = fft_type == FftType::IRFFT; + if (output_is_float) { + CopyDataToOutput<float>(data, output_start, fft_rank, fft_type, fft_lengths, + fft_strides, output_lengths, output_strides, + output_literal); + } else { + CopyDataToOutput<complex64>(data, output_start, fft_rank, fft_type, + fft_lengths, fft_strides, output_lengths, + output_strides, output_literal); + } +} + +Status CheckParameters(const Shape& input_shape, const Shape& output_shape, + int64 fft_rank, FftType fft_type, + const absl::Span<const int64> fft_lengths) { + // Check FFT parameters. + if (fft_rank <= 0) { + return InvalidArgument("Zero or negative FFT rank."); + } + if (*absl::c_min_element(fft_lengths) < 0) { + return InvalidArgument("Negative FFT length."); + } + + // Check input-related values. + TF_CHECK_OK(ShapeUtil::ValidateShape(input_shape)); + if (!input_shape.IsArray()) { + return Unimplemented("Only array input shapes are supported."); + } + auto input_elt_type = input_shape.element_type(); + if (fft_type == FftType::RFFT && input_elt_type != PrimitiveType::F32) { + return InvalidArgument("Invalid input type: %d, must be %d (float).", + input_elt_type, PrimitiveType::F32); + } + if (fft_type != FftType::RFFT && input_elt_type != PrimitiveType::C64) { + return InvalidArgument("Invalid input type: %d, must be %d (complex64).", + input_elt_type, PrimitiveType::C64); + } + const int64 input_rank = input_shape.rank(); + if (input_rank < fft_rank) { + return InvalidArgument("Input shape rank is smaller than FFT rank."); + } + + // Check output-related values. 
+ TF_CHECK_OK(ShapeUtil::ValidateShape(output_shape)); + if (!output_shape.IsArray()) { + return Unimplemented("Only array output shapes are supported."); + } + auto output_elt_type = output_shape.element_type(); + if (fft_type == FftType::IRFFT && output_elt_type != PrimitiveType::F32) { + return InvalidArgument("Invalid output type: %d, must be %d (float).", + output_elt_type, PrimitiveType::F32); + } + if (fft_type != FftType::IRFFT && output_elt_type != PrimitiveType::C64) { + return InvalidArgument("Invalid output type: %d, must be %d (complex64).", + output_elt_type, PrimitiveType::C64); + } + const int64 output_rank = output_shape.rank(); + if (output_rank < fft_rank) { + return InvalidArgument("Output shape rank is smaller than FFT rank."); + } + + // Consistency of input and output parameters. + if (input_rank != output_rank) { + return InvalidArgument( + "Ranks of input shape and output shape do not match."); + } + for (int64 dim = 0; dim < input_rank - fft_rank; dim++) { + if (ShapeUtil::GetDimension(input_shape, dim) != + ShapeUtil::GetDimension(output_shape, dim)) { + return InvalidArgument( + "Higher dimension lengths of input shape and output shape do not " + "match."); + } + } + + return Status::OK(); +} + +} // namespace + +// Flexible but slow implementation of the discrete Fourier transform. All +// transform types (FFT, IFFT, RFFT, and IRFFT) are supported, as well as +// arbitrary rank and length of each dimension of the transform, and arbitrary +// layouts of the input and output literals. +// +// The input literal in operand 0 provides input data, which must be complex64 +// for FFT, IFFT, and IRFFT transforms and float for RFFT. The transform is +// computed over the innermost dimensions of the input, thus the rank of the +// input data must be the same as fft_rank or larger. The input is expected to +// provide Ni values along each transform axis with one exception: for IRFFT, +// only (N0 / 2) + 1 values are needed along the X axis (the innermost index). +// To increase flexibility, this implementation can handle mismatches between +// the input size and transform lengths by either dropping extra input values +// or using zeroes in place of missing input values as necessary. If the input +// data has a rank higher than the transform, the transform is applied for each +// valid combination of the higher-ranking indices. +// +// The output contains complex64 values for FFT, IFFT, RFFT, and float values +// for IRFFT. The rank of the output as well as the sizes of the dimensions +// above the rank of the transform must match those of the input. Sizes of the +// output's "fft_rank" innermost dimensions are expected to match the length of +// the transform along respective axes with one exception: for RFFT, the output +// is trimmed along the X axis to have only (N0 / 2) + 1 values. In case the +// length(s) mismatch, the FFT output is trimmed to fit into the provided output +// shape, or the output is padded with zero values appropriately. +// +// For example, a 2D FFT transform of size 16x16 applied to a +// complex64[2][15][17] input array will perform two transforms over the +// [][15][17] data in the subarrays [0][][] and [1][][], dropping the values +// along axis X and padding axis Y with zeroes to create 16x16 working sets, and +// generating complex64[2][16][16] output. A 3D IRFFT transform of size 64x16x16 +// applied to a complex64[64][16][9] input array will use all input values and +// will produce float[64][16][16] output.
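+// As a smaller illustration, a 1D RFFT of length 8 applied to f32[3][5] input +// zero-pads each row to length 8 and produces c64[3][5] output, keeping only +// (8 / 2) + 1 = 5 values per row.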
+// +// The implementation of the 1D transform is a straightforward loop nest. The +// transforms of higher ranks apply sets of 1D transforms along each axis. For +// example, the 2D transform is computed by applying 1D transforms to each +// column followed by applying 1D transforms to each row. +// +// In general, a transform of rank n runs in O(N0*N1*...*Nn*(N0+N1+...+Nn)) +// time, where Ni is the length of the transform's i-th dimension. It is +// possible to reduce the run time to O(N0*N1*...(log(N0)+log(N1)+...)) by +// plugging in a more efficient 1D implementation. +// +Status HloEvaluator::HandleFft(HloInstruction* fft) { + const FftType fft_type = fft->fft_type(); + std::vector<int64> fft_lengths = fft->fft_length(); + const int64 fft_rank = fft_lengths.size(); + const Literal& input_literal = GetEvaluatedLiteralFor(fft->operand(0)); + const Shape& input_shape = input_literal.shape(); + const Shape& output_shape = fft->shape(); + Literal output_literal = Literal::CreateFromShape(output_shape); + + // Make fft_lengths[0] the minor-most dimension. + absl::c_reverse(fft_lengths); + + TF_RETURN_IF_ERROR(CheckParameters(input_shape, output_shape, fft_rank, + fft_type, fft_lengths)); + + const auto fft_strides = ComputeStrides(fft_lengths); + + // Working set size. + const int64 fft_size = fft_strides[fft_rank]; + + if (fft_size > 0) { + // Linearized working data set. + std::vector<complex128> data(fft_size); + + // Temporary buffer allocated once and used in 1D sweeps. + std::vector<complex128> buffer(*absl::c_max_element(fft_lengths)); + + // Sizes of each axis of input and output literals. + const auto input_lengths = GetDimensionLengths(input_literal); + const auto output_lengths = GetDimensionLengths(output_literal); + + // Strides for generating linearized indices into multidimensional arrays. + const auto input_strides = ComputeStrides(input_lengths, input_literal); + const auto output_strides = ComputeStrides(output_lengths, output_literal); + + // Visit all elements in the dimensions with ranks above the FFT rank. For + // each such element invoke the transform. Use separate indices for the + // input and the output to allow different layouts. + auto base_case = [&](int64 axis, int64 output_index, int64 input_index, + bool within_src_bounds) { + if (axis == fft_rank - 1) { + // Base case: copy the data from the input literal, apply the + // transform, and copy the result to the output literal. + CHECK(within_src_bounds); + bool input_is_zero = + CopyDataFromInput(input_literal, input_index, fft_rank, fft_type, + fft_size, fft_lengths, fft_strides, input_lengths, + input_strides, absl::MakeSpan(data)); + if (!input_is_zero) { + // Make 1D sweeps along each transform axis. + Sweep(fft_rank, fft_type, fft_lengths, fft_strides, + absl::MakeSpan(data), absl::MakeSpan(buffer)); + } + CopyDataToOutput(absl::MakeSpan(data), output_index, fft_rank, fft_type, + fft_lengths, fft_strides, output_lengths, + output_strides, &output_literal); + return true; + } + return false; + }; + GenerateIndices(output_lengths, output_strides, input_lengths, + input_strides, input_shape.rank(), 0, 0, base_case); + } + + evaluated_[fft] = std::move(output_literal); + return Status::OK(); +} + // Returns an ShapeUtil::IndexIterationSpace that iterates over the output batch // dimensions while keeping the rest of the output dimensions clamped to 0. 
ShapeUtil::IndexIterationSpace IterationSpaceForOutputBatchIndices( diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h index 357975a131d..45b6a2754d6 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator.h @@ -204,6 +204,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault { Status HandleTuple(HloInstruction* tuple) override; + Status HandleFft(HloInstruction* fft) override; + Status HandleGather(HloInstruction* gather) override; Status HandleGetTupleElement(HloInstruction* get_tuple_element) override; diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc index eb0ed82eac8..68221c036b9 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc @@ -56,7 +56,7 @@ static std::array<bool, 2> use_bf16_params{true, false}; // In bf16 mode, all f32 shapes are converted to bf16 before running. class HloEvaluatorTest : public HloTestBase { public: - HloEvaluatorTest() : use_bfloat16_(false) {} + HloEvaluatorTest() : use_bfloat16_(false) { InitializeFftData(); } StatusOr<Literal> Evaluate( absl::Span<const Literal* const> arg_literals = {}) { @@ -130,11 +130,24 @@ class HloEvaluatorTest : public HloTestBase { } protected: - explicit HloEvaluatorTest(bool use_bfloat16) : use_bfloat16_(use_bfloat16) {} + explicit HloEvaluatorTest(bool use_bfloat16) : use_bfloat16_(use_bfloat16) { + InitializeFftData(); + } + + // Initializes data sets used in FFT tests below. + void InitializeFftData(); + HloEvaluator evaluator_; const bool use_bfloat16_; std::unique_ptr<HloModule> m_ = CreateNewVerifiedModule(); + + // Data sets used in FFT tests below. + ErrorSpec fft_error_ = ErrorSpec(1e-4, 1e-5); + Literal fft_c64x2x4x8_; + Literal fft_c64x2x4x8_1d_; + Literal fft_c64x2x4x8_2d_; + Literal fft_c64x2x4x8_3d_; }; // Lets you write TEST_Ps that run twice, once with and once without bf16. 
@@ -339,6 +352,13 @@ TEST_P(HloEvaluatorBf16Test, DoesAbsR1WithZeroSize) { auto expected = LiteralUtil::CreateR1<float>({}); TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand)); } + +TEST_F(HloEvaluatorTest, DoesAbsC128) { + auto x = LiteralUtil::CreateR0<complex128>({1, 2}); + auto expected_real = LiteralUtil::CreateR0<double>(2.23607); + TestUnaryOp(HloOpcode::kAbs, std::move(expected_real), std::move(x), 3e-06); +} + TEST_F(HloEvaluatorTest, DoesNegateR2) { auto operand = LiteralUtil::CreateR2<int32>( {{0, std::numeric_limits<int32>::min()}, {-1, 4}}); @@ -1423,6 +1443,1015 @@ TEST_P(HloEvaluatorBf16Test, Conv2DGroupedConvolution) { EXPECT_TRUE(LiteralTestUtil::Equal(expected, result)); } +// Initialization of data sets for FFT tests: + +void HloEvaluatorTest::InitializeFftData() { + // clang-format off + fft_c64x2x4x8_ = LiteralUtil::CreateR3<complex64>({ + {{{0.0, 0.0}, {1.0, 0.0}, {2.0, 0.0}, {3.0, 0.0}, + {4.0, 0.0}, {5.0, 0.0}, {6.0, 0.0}, {7.0, 0.0}}, + {{0.0, 0.0}, {0.0, 1.0}, {0.0, 2.0}, {0.0, 3.0}, + {0.0, 4.0}, {0.0, 5.0}, {0.0, 6.0}, {0.0, 7.0}}, + {{0.0, 7.0}, {1.0, 6.0}, {2.0, 5.0}, {3.0, 4.0}, + {4.0, 3.0}, {5.0, 2.0}, {6.0, 1.0}, {7.0, 0.0}}, + {{7.0, 0.0}, {6.0, 1.0}, {5.0, 2.0}, {4.0, 3.0}, + {3.0, 4.0}, {2.0, 5.0}, {1.0, 6.0}, {0.0, 7.0}}}, + {{{-4.0, 0.0}, {-3.0, 0.0}, {-2.0, 0.0}, {-1.0, 0.0}, + {1.0, 0.0}, {2.0, 0.0}, {3.0, 0.0}, {4.0, 0.0}}, + {{0.0, -4.0}, {0.0, -3.0}, {0.0, -2.0}, {0.0, -1.0}, + {0.0, 1.0}, {0.0, 2.0}, {0.0, 3.0}, {0.0, 4.0}}, + {{3.5, 3.5}, {-1.707107, -0.707107}, {-1.0, -0.0}, {-0.707107, 0.292893}, + {-0.5, 0.5}, {-0.292893, 0.707107}, {0.0, 1.0}, {0.707107, 1.707107}}, + {{3.5, 3.5}, {1.707107, 0.707107}, {1.0, 0.0}, {0.707107, -0.292893}, + {0.5, -0.5}, {0.292893, -0.707107}, {-0.0, -1.0}, {-0.707107, -1.707107}}} + }); + fft_c64x2x4x8_1d_ = LiteralUtil::CreateR3<complex64>({ + {{{28.0, 0.0}, {-4.0, 9.656854}, {-4.0, 4.0}, {-4.0, 1.656854}, + {-4.0, 0.0}, {-4.0, -1.656854}, {-4.0, -4.0}, {-4.0, -9.656854}}, + {{0.0, 28.0}, {-9.656854, -4.0}, {-4.0, -4.0}, {-1.656854, -4.0}, + {0.0, -4.0}, {1.656854, -4.0}, {4.0, -4.0}, {9.656854, -4.0}}, + {{28.0, 28.0}, {5.656854, 13.656854}, {0.0, 8.0}, {-2.343146, 5.656854}, + {-4.0, 4.0}, {-5.656854, 2.343146}, {-8.0, -0.0}, {-13.656854, -5.656854}}, // NOLINT + {{28.0, 28.0}, {-5.656854, -13.656854}, {-0.0, -8.0}, {2.343146, -5.656854}, // NOLINT + {4.0, -4.0}, {5.656854, -2.343146}, {8.0, 0.0}, {13.656854, 5.656854}}}, + {{{0.0, 0.0}, {-5.0, 12.071068}, {-4.0, 4.0}, {-5.0, 2.071068}, + {-4.0, 0.0}, {-5.0, -2.071068}, {-4.0, -4.0}, {-5.0, -12.071068}}, + {{0.0, 0.0}, {-12.071068, -5.0}, {-4.0, -4.0}, {-2.071068, -5.0}, + {0.0, -4.0}, {2.071068, -5.0}, {4.0, -4.0}, {12.071068, -5.0}}, + {{0.0, 7.0}, {1.0, 6.0}, {2.0, 5.0}, {3.0, 4.0}, + {4.0, 3.0}, {5.0, 2.0}, {6.0, 1.0}, {7.0, 0.0}}, + {{7.0, 0.0}, {6.0, 1.0}, {5.0, 2.0}, {4.0, 3.0}, + {3.0, 4.0}, {2.0, 5.0}, {1.0, 6.0}, {0.0, 7.0}}} + }); + fft_c64x2x4x8_2d_ = LiteralUtil::CreateR3<complex64>({ + {{{84.0, 84.0}, {-13.656854, 5.656854}, {-8.0, 0.0}, {-5.656854, -2.343146}, + {-4.0, -4.0}, {-2.343146, -5.656854}, {0.0, -8.0}, {5.656854, -13.656854}}, // NOLINT + {{0.0, 0.0}, {0.0, -0.0}, {0.0, 0.0}, {0.0, 0.0}, + {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{28.0, -28.0}, {16.970562, 40.970562}, {0.0, 24.0}, {-7.029438, 16.970562}, // NOLINT + {-12.0, 12.0}, {-16.970562, 7.029438}, {-24.0, 0.0}, {-40.970562, -16.970562}}, // NOLINT + {{0.0, -56.0}, {-19.313708, -8.0}, {-8.0, -8.0}, {-3.313708, -8.0}, + {0.0, -8.0}, {3.313708, 
-8.0}, {8.0, -8.0}, {19.313708, -8.0}}}, + {{{7.0, 7.0}, {-10.071068, 14.071068}, {-1.0, 7.0}, {-0.071068, 4.071068}, + {3.0, 3.0}, {4.071068, -0.071068}, {7.0, -1.0}, {14.071068, -10.071068}}, + {{0.0, 0.0}, {-12.0, 24.142136}, {-12.0, 8.0}, {-16.0, 4.142136}, + {-16.0, 0.0}, {-20.0, -4.142136}, {-20.0, -8.0}, {-24.0, -24.142136}}, + {{-7.0, 7.0}, {2.071068, 22.071068}, {-3.0, 11.0}, {-3.928932, 8.071068}, + {-3.0, 3.0}, {-4.071068, -0.071068}, {-3.0, -5.0}, {-10.071068, -14.071068}}, // NOLINT + {{0.0, -14.0}, {0.0, -12.0}, {0.0, -10.0}, {0.0, -8.0}, + {0.0, -6.0}, {0.0, -4.0}, {0.0, -2.0}, {0.0, 0.0}}} + }); + fft_c64x2x4x8_3d_ = LiteralUtil::CreateR3<complex64>({ + {{{91.0, 91.0}, {-23.727922, 19.727922}, {-9.0, 7.0}, {-5.727922, 1.727922}, + {-1.0, -1.0}, {1.727922, -5.727922}, {7.0, -9}, {19.727922, -23.727922}}, + {{0.0, 0.0}, {-12.0, 24.142136}, {-12.0, 8.0}, {-16.0, 4.142136}, + {-16.0, 0.0}, {-20.0, -4.142136}, {-20.0, -8.0}, {-24.0, -24.142136}}, + {{21.0, -21.0}, {19.041630, 63.041630}, {-3.0, 35.0}, {-10.958370, 25.041630}, // NOLINT + {-15.0, 15.0}, {-21.041630, 6.958370}, {-27.0, -5.0}, {-51.041630, -31.041630}}, // NOLINT + {{0.0, -70.0}, {-19.313708, -20.0}, {-8.0, -18.0}, {-3.313708, -16.0}, + {0.0, -14.0}, {3.313708, -12.0}, {8.0, -10.0}, {19.313708, -8.0}}}, + {{{77.0, 77.0}, {-3.585786, -8.414214}, {-7.0, -7.0}, {-5.585786, -6.414214}, // NOLINT + {-7.0, -7.0}, {-6.414214, -5.585786}, {-7.0, -7.0}, {-8.414214, -3.585786}}, // NOLINT + {{0.0, 0.0}, {12.0, -24.142136}, {12.0, -8.0}, {16.0, -4.142136}, + {16.0, 0.0}, {20.0, 4.142136}, {20.0, 8.0}, {24.0, 24.142136}}, + {{35.0, -35.0}, {14.899494, 18.899494}, {3.0, 13.0}, {-3.100506, 8.899494}, + {-9.0, 9.0}, {-12.899494, 7.100506}, {-21.0, 5.0}, {-30.899494, -2.899494}}, // NOLINT + {{0.0, -42.0}, {-19.313708, 4.0}, {-8.0, 2.0}, {-3.313708, 0.0}, + {0.0, -2.0}, {3.313708, -4.0}, {8.0, -6.0}, {19.313708, -8.0}}} + }); + // clang-format on +} + +// Simple FFT tests: + +TEST_F(HloEvaluatorTest, 1D_FFT_4_on_c64x4) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[4] parameter(0) + ROOT fft = c64[4] fft(operand), fft_type=FFT, fft_length={4} +} +)"; + auto input = LiteralUtil::CreateR1<complex64>( + {{1.0, 0.0}, {2.0, 0.0}, {3.0, 0.0}, {4.0, 0.0}}); + auto expected = LiteralUtil::CreateR1<complex64>( + {{10.0, 0.0}, {-2.0, 2.0}, {-2.0, 0.0}, {-2.0, -2.0}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 1D_IFFT_4_on_c64x4) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[4] parameter(0) + ROOT ifft = c64[4] fft(operand), fft_type=IFFT, fft_length={4} +} +)"; + auto input = LiteralUtil::CreateR1<complex64>( + {{10.0, 0.0}, {-2.0, 2.0}, {-2.0, 0.0}, {-2.0, -2.0}}); + auto expected = LiteralUtil::CreateR1<complex64>( + {{1.0, 0.0}, {2.0, 0.0}, {3.0, 0.0}, {4.0, 0.0}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 1D_RFFT_4_on_f32x4) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = f32[4] parameter(0) + ROOT rfft = c64[3] fft(operand), 
fft_type=RFFT, fft_length={4} +} +)"; + auto input = LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0, 4.0}); + auto expected = + LiteralUtil::CreateR1<complex64>({{10.0, 0.0}, {-2.0, 2.0}, {-2.0, 0.0}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 1D_IRFFT_4_on_c64x3) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[3] parameter(0) + ROOT irfft = f32[4] fft(operand), fft_type=IRFFT, fft_length={4} +} +)"; + auto input = + LiteralUtil::CreateR1<complex64>({{10.0, 0.0}, {-2.0, 2.0}, {-2.0, 0.0}}); + auto expected = LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0, 4.0}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +// 1D FFT tests: + +TEST_F(HloEvaluatorTest, 1D_FFT_8_on_c64x2x4x8) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[2, 4, 8] parameter(0) + ROOT fft = c64[2, 4, 8] fft(operand), fft_type=FFT, fft_length={8} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&fft_c64x2x4x8_})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_1d_.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_1d_, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 1D_IFFT_8_on_c64x2x4x8) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[2, 4, 8] parameter(0) + ROOT ifft = c64[2, 4, 8] fft(operand), fft_type=IFFT, fft_length={8} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&fft_c64x2x4x8_1d_})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 1D_RFFT_8_on_f32x8) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = f32[8] parameter(0) + ROOT rfft = c64[5] fft(operand), fft_type=RFFT, fft_length={8} +} +)"; + auto input = + LiteralUtil::CreateR1<float>({1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1}); + auto expected = LiteralUtil::CreateR1<complex64>({{39.6, 0.0}, + {-3.6, 8.691169}, + {-3.6, 3.6}, + {-3.6, 1.491169}, + {-3.6, 0.0}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 1D_IRFFT_8_on_c64x5) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[5] parameter(0) + ROOT irfft = f32[8] fft(operand), fft_type=IRFFT, fft_length={8} +} +)"; + auto input = LiteralUtil::CreateR1<complex64>({{39.6, 0.0}, + {-3.6, 8.691169}, + {-3.6, 3.6}, + {-3.6, 1.491169}, + {-3.6, 0.0}}); + auto expected = + LiteralUtil::CreateR1<float>({1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + 
EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 1D_RFFT_9_on_f32x9) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = f32[9] parameter(0) + ROOT rfft = c64[5] fft(operand), fft_type=RFFT, fft_length={9} +} +)"; + auto input = LiteralUtil::CreateR1<float>( + {1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9.9}); + auto expected = LiteralUtil::CreateR1<complex64>({{49.5, 0.0}, + {-3.360560, 11.705792}, + {-3.893717, 5.712929}, + {-4.5, 3.117691}, + {-4.895723, 1.021942}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 1D_IRFFT_9_on_c64x5) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[5] parameter(0) + ROOT irfft = f32[9] fft(operand), fft_type=IRFFT, fft_length={9} +} +)"; + auto input = LiteralUtil::CreateR1<complex64>({{49.5, 0.0}, + {-3.360560, 11.705792}, + {-3.893717, 5.712929}, + {-4.5, 3.117691}, + {-4.895723, 1.021942}}); + auto expected = LiteralUtil::CreateR1<float>( + {1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9.9}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +// 2D FFT tests: + +TEST_F(HloEvaluatorTest, 2D_FFT_4x8_on_c64x2x4x8) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[2, 4, 8] parameter(0) + ROOT fft = c64[2, 4, 8] fft(operand), fft_type=FFT, fft_length={4, 8} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&fft_c64x2x4x8_})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_2d_.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_2d_, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 2D_IFFT_4x8_on_c64x2x4x8) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[2, 4, 8] parameter(0) + ROOT ifft = c64[2, 4, 8] fft(operand), fft_type=IFFT, fft_length={4, 8} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&fft_c64x2x4x8_2d_})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 2D_RFFT_3x8_on_f32x3x8) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = f32[3, 8] parameter(0) + ROOT rfft = c64[3, 5] fft(operand), fft_type=RFFT, fft_length={3, 8} +} +)"; + auto input = + LiteralUtil::CreateR2<float>({{1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1}, + {8.1, 7.2, 6.3, 5.4, 4.5, 3.6, 2.7, 1.8}, + {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8}}); + auto expected = LiteralUtil::CreateR2<complex64>({{{118.8, 0.0}, + {-4.4, 10.622540}, + {-4.4, 4.4}, + {-4.4, 1.822540}, + {-4.4, 0.0}}, + {{0.0, 0.0}, + {-19.926162, 0.797280}, + {-10.128203, -3.728203}, + {-6.069756, -5.602720}, + {-3.2, -6.928203}}, + {{0.0, 0.0}, + {13.526162, 14.653687}, + {3.728203, 10.128203}, + {-0.330244, 8.253687}, + {-3.2, 6.928203}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, 
ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 2D_IRFFT_3x8_on_c64x3x5) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[3, 5] parameter(0) + ROOT irfft = f32[3, 8] fft(operand), fft_type=IRFFT, fft_length={3, 8} +} +)"; + auto input = LiteralUtil::CreateR2<complex64>({{{118.8, 0.0}, + {-4.4, 10.622540}, + {-4.4, 4.4}, + {-4.4, 1.822540}, + {-4.4, 0.0}}, + {{0.0, 0.0}, + {-19.926162, 0.797280}, + {-10.128203, -3.728203}, + {-6.069756, -5.602720}, + {-3.2, -6.928203}}, + {{0.0, 0.0}, + {13.526162, 14.653687}, + {3.728203, 10.128203}, + {-0.330244, 8.253687}, + {-3.2, 6.928203}}}); + auto expected = + LiteralUtil::CreateR2<float>({{1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1}, + {8.1, 7.2, 6.3, 5.4, 4.5, 3.6, 2.7, 1.8}, + {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 2D_RFFT_3x9_on_f32x3x9) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = f32[3, 9] parameter(0) + ROOT rfft = c64[3, 5] fft(operand), fft_type=RFFT, fft_length={3, 9} +} +)"; + auto input = LiteralUtil::CreateR2<float>( + {{1.9, 2.8, 3.7, 4.6, 5.5, 6.4, 7.3, 8.2, 9.1}, + {9.1, 8.2, 7.3, 6.4, 5.5, 4.6, 3.7, 2.8, 1.9}, + {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9}}); + auto expected = LiteralUtil::CreateR2<complex64>({{{148.5, 0.0}, + {-4.95, 13.600013}, + {-4.95, 5.899180}, + {-4.95, 2.857884}, + {-4.95, 0.872819}}, + {{0.0, 0.0}, + {-25.014467, 2.096690}, + {-12.888800, -3.503916}, + {-8.1, -5.715768}, + {-4.974333, -7.159452}}, + {{0.0, 0.0}, + {17.814467, 17.685147}, + {5.688800, 12.084542}, + {0.9, 9.872690}, + {-2.225667, 8.429006}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 2D_IRFFT_3x9_on_c64x3x5) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[3, 5] parameter(0) + ROOT irfft = f32[3, 9] fft(operand), fft_type=IRFFT, fft_length={3, 9} +} +)"; + auto input = LiteralUtil::CreateR2<complex64>({{{148.5, 0.0}, + {-4.95, 13.600013}, + {-4.95, 5.899180}, + {-4.95, 2.857884}, + {-4.95, 0.872819}}, + {{0.0, 0.0}, + {-25.014467, 2.096690}, + {-12.888800, -3.503916}, + {-8.1, -5.715768}, + {-4.974333, -7.159452}}, + {{0.0, 0.0}, + {17.814467, 17.685147}, + {5.688800, 12.084542}, + {0.9, 9.872690}, + {-2.225667, 8.429006}}}); + auto expected = LiteralUtil::CreateR2<float>( + {{1.9, 2.8, 3.7, 4.6, 5.5, 6.4, 7.3, 8.2, 9.1}, + {9.1, 8.2, 7.3, 6.4, 5.5, 4.6, 3.7, 2.8, 1.9}, + {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +// 3D FFT tests: + +TEST_F(HloEvaluatorTest, 3D_FFT_2x4x8_on_c64x2x4x8) { + const 
char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[2, 4, 8] parameter(0) + ROOT fft = c64[2, 4, 8] fft(operand), fft_type=FFT, fft_length={2, 4, 8} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&fft_c64x2x4x8_})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_3d_.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_3d_, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 3D_IFFT_2x4x8_on_c64x2x4x8) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[2, 4, 8] parameter(0) + ROOT ifft = c64[2, 4, 8] fft(operand), fft_type=IFFT, fft_length={2, 4, 8} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&fft_c64x2x4x8_3d_})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 3D_RFFT_3x3x4_on_f32x3x3x4) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = f32[3, 3, 4] parameter(0) + ROOT rfft = c64[3, 3, 3] fft(operand), fft_type=RFFT, fft_length={3, 3, 4} +} +)"; + auto input = LiteralUtil::CreateR3<float>( + {{{1.8, 2.7, 3.6, 4.5}, {8.1, 7.2, 6.3, 5.4}, {1.1, 2.2, 3.3, 4.4}}, + {{5.4, 6.3, 7.2, 8.1}, {4.5, 3.6, 2.7, 1.8}, {5.5, 6.6, 7.7, 8.8}}, + {{-1.8, -2.7, -3.6, -4.5}, + {-5.4, -6.3, -7.2, -8.1}, + {1.9, 2.9, 3.9, 4.9}}}); + auto expected = LiteralUtil::CreateR3<complex64>( + {{{{92.8, 0.0}, {-2.8, 2.8}, {-2.8, 0.0}}, + {{-5.9, 35.160631}, {-11.519100, -8.919100}, {-1.3, -10.219100}}, + {{-5.9, -35.160631}, {8.919100, 11.519100}, {-1.3, 10.219100}}}, + {{{29.5, -81.579593}, {1.390897, 5.190897}, {-1.9, 3.290897}}, + {{-25.1, -49.017038}, {1.044486, 4.844486}, {-1.9, 2.944486}}, + {{11.8, 27.712813}, {1.517691, 4.717691}, {-1.6, 3.117691}}}, + {{{29.5, 81.579593}, {-5.190897, -1.390897}, {-1.9, -3.290897}}, + {{11.8, -27.712813}, {-4.717691, -1.517691}, {-1.6, -3.117691}}, + {{-25.1, 49.017038}, {-4.844486, -1.044486}, {-1.9, -2.944486}}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 3D_IRFFT_3x3x4_on_c64x3x3x3) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[3, 3, 3] parameter(0) + ROOT irfft = f32[3, 3, 4] fft(operand), fft_type=IRFFT, fft_length={3, 3, 4} +} +)"; + auto input = LiteralUtil::CreateR3<complex64>( + {{{{92.8, 0.0}, {-2.8, 2.8}, {-2.8, 0.0}}, + {{-5.9, 35.160631}, {-11.519100, -8.919100}, {-1.3, -10.219100}}, + {{-5.9, -35.160631}, {8.919100, 11.519100}, {-1.3, 10.219100}}}, + {{{29.5, -81.579593}, {1.390897, 5.190897}, {-1.9, 3.290897}}, + {{-25.1, -49.017038}, {1.044486, 4.844486}, {-1.9, 2.944486}}, + {{11.8, 27.712813}, {1.517691, 4.717691}, {-1.6, 3.117691}}}, + {{{29.5, 81.579593}, {-5.190897, -1.390897}, {-1.9, -3.290897}}, + {{11.8, -27.712813}, {-4.717691, -1.517691}, {-1.6, -3.117691}}, + {{-25.1, 49.017038}, {-4.844486, -1.044486}, {-1.9, -2.944486}}}}); + auto expected = LiteralUtil::CreateR3<float>( + {{{1.8, 2.7, 3.6, 4.5}, {8.1, 7.2, 6.3, 5.4}, {1.1, 2.2, 3.3, 4.4}}, + {{5.4, 6.3, 7.2, 8.1}, {4.5, 3.6, 2.7, 1.8}, {5.5, 6.6, 7.7, 8.8}}, + {{-1.8, -2.7, -3.6, -4.5}, + {-5.4, -6.3, -7.2, -8.1}, 
+ {1.9, 2.9, 3.9, 4.9}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 3D_RFFT_3x3x5_on_f32x3x3x5) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = f32[3, 3, 5] parameter(0) + ROOT rfft = c64[3, 3, 3] fft(operand), fft_type=RFFT, fft_length={3, 3, 5} +} +)"; + auto input = LiteralUtil::CreateR3<float>({{{1.8, 2.7, 3.6, 4.5, 5.4}, + {8.1, 7.2, 6.3, 5.4, 4.5}, + {1.1, 2.2, 3.3, 4.4, 5.5}}, + {{5.4, 6.3, 7.2, 8.1, 9.0}, + {4.5, 3.6, 2.7, 1.8, 0.9}, + {5.5, 6.6, 7.7, 8.8, 9.9}}, + {{-1.8, -2.7, -3.6, -4.5, -5.4}, + {-5.4, -6.3, -7.2, -8.1, -9.0}, + {1.9, 2.9, 3.9, 4.9, 5.9}}}); + auto expected = LiteralUtil::CreateR3<complex64>( + {{{{119.5, 0.0}, {-3.5, 4.817337}, {-3.5, 1.137219}}, + {{-5.75, 56.724664}, {-19.206730, -10.537254}, {-5.775483, -12.245880}}, + {{-5.75, -56.724664}, {15.956730, 15.010495}, {2.525483, 13.301869}}}, + {{{39.25, -106.088112}, {3.286913, 7.382528}, {-1.038404, 4.885305}}, + {{-29.0, -64.951905}, {2.690922, 6.949515}, {-1.179098, 4.452292}}, + {{16.75, 30.743902}, {3.363918, 6.649878}, {-0.733751, 4.546954}}}, + {{{39.25, 106.088112}, {-8.036913, -0.844714}, {-3.711596, -3.341936}}, + {{16.75, -30.743902}, {-7.363918, -1.144350}, {-3.266249, -3.247275}}, + {{-29.0, 64.951905}, {-7.440922, -0.411701}, {-3.570902, -2.908924}}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 3D_IRFFT_3x3x5_on_c64x3x3x3) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[3, 3, 3] parameter(0) + ROOT irfft = f32[3, 3, 5] fft(operand), fft_type=IRFFT, fft_length={3, 3, 5} +} +)"; + auto input = LiteralUtil::CreateR3<complex64>( + {{{{119.5, 0.0}, {-3.5, 4.817337}, {-3.5, 1.137219}}, + {{-5.75, 56.724664}, {-19.206730, -10.537254}, {-5.775483, -12.245880}}, + {{-5.75, -56.724664}, {15.956730, 15.010495}, {2.525483, 13.301869}}}, + {{{39.25, -106.088112}, {3.286913, 7.382528}, {-1.038404, 4.885305}}, + {{-29.0, -64.951905}, {2.690922, 6.949515}, {-1.179098, 4.452292}}, + {{16.75, 30.743902}, {3.363918, 6.649878}, {-0.733751, 4.546954}}}, + {{{39.25, 106.088112}, {-8.036913, -0.844714}, {-3.711596, -3.341936}}, + {{16.75, -30.743902}, {-7.363918, -1.144350}, {-3.266249, -3.247275}}, + {{-29.0, 64.951905}, {-7.440922, -0.411701}, {-3.570902, -2.908924}}}}); + auto expected = LiteralUtil::CreateR3<float>({{{1.8, 2.7, 3.6, 4.5, 5.4}, + {8.1, 7.2, 6.3, 5.4, 4.5}, + {1.1, 2.2, 3.3, 4.4, 5.5}}, + {{5.4, 6.3, 7.2, 8.1, 9.0}, + {4.5, 3.6, 2.7, 1.8, 0.9}, + {5.5, 6.6, 7.7, 8.8, 9.9}}, + {{-1.8, -2.7, -3.6, -4.5, -5.4}, + {-5.4, -6.3, -7.2, -8.1, -9.0}, + {1.9, 2.9, 3.9, 4.9, 5.9}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +// FFT tests with non-default data layout: + +TEST_F(HloEvaluatorTest, 1D_FFT_8_on_c64x2x4x8_with_layout) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = 
c64[2, 4, 8]{0, 2, 1} parameter(0) + ROOT fft = c64[2, 4, 8]{1, 2, 0} fft(operand), fft_type=FFT, fft_length={8} +} +)"; + auto input = fft_c64x2x4x8_.Relayout(LayoutUtil::MakeLayout({0, 2, 1})); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_1d_.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_1d_, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 2D_FFT_4x8_on_c64x2x4x8_with_layout) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[2, 4, 8]{2, 0, 1} parameter(0) + ROOT fft = c64[2, 4, 8]{1, 0, 2} fft(operand), fft_type=FFT, fft_length={4, 8} +} +)"; + auto input = fft_c64x2x4x8_.Relayout(LayoutUtil::MakeLayout({2, 0, 1})); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_2d_.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_2d_, result, fft_error_)); +} + +TEST_F(HloEvaluatorTest, 3D_FFT_2x4x8_on_c64x2x4x8_with_layout) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[2, 4, 8]{1, 2, 0} parameter(0) + ROOT fft = + c64[2, 4, 8]{0, 2, 1} fft(operand), fft_type=FFT, fft_length={2, 4, 8} +} +)"; + auto input = fft_c64x2x4x8_.Relayout(LayoutUtil::MakeLayout({1, 2, 0})); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), fft_c64x2x4x8_3d_.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(fft_c64x2x4x8_3d_, result, fft_error_)); +} + +// FFT tests with unusual parameters: + +// Zero-length transform. +TEST_F(HloEvaluatorTest, 1D_FFT_0_on_c64x1x1x1x1) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[1, 1, 1, 1] parameter(0) + ROOT fft = c64[1, 1, 1, 1] fft(operand), fft_type=FFT, fft_length={0} +} +)"; + auto input = LiteralUtil::CreateR4<complex64>({{{{{42.24, 24.42}}}}}); + auto expected = LiteralUtil::CreateR4<complex64>({{{{{0.0, 0.0}}}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +// Zero-length axis. +TEST_F(HloEvaluatorTest, 1D_FFT_1_on_c64x1x1x1x0) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[1, 1, 1, 0] parameter(0) + ROOT fft = c64[1, 1, 1, 0] fft(operand), fft_type=FFT, fft_length={1} +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + auto input, + LiteralUtil::CreateR4<complex64>({{{{}}}}).Reshape({1, 1, 1, 0})); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_)); +} + +// Some/all dimensions have length 1. 
+TEST_F(HloEvaluatorTest, 1D_FFT_1_on_c64x1x1x1x1) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[1, 1, 1, 1] parameter(0) + ROOT fft = c64[1, 1, 1, 1] fft(operand), fft_type=FFT, fft_length={1} +} +)"; + auto input = LiteralUtil::CreateR4<complex64>({{{{{42.24, 24.42}}}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_)); +} + +// Zero-length transform. +TEST_F(HloEvaluatorTest, 3D_FFT_1x0x1_on_c64x1x1x1x1) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[1, 1, 1, 1] parameter(0) + ROOT fft = c64[1, 1, 1, 1] fft(operand), fft_type=FFT, fft_length={1, 0, 1} +} +)"; + auto input = LiteralUtil::CreateR4<complex64>({{{{{42.24, 24.42}}}}}); + auto expected = LiteralUtil::CreateR4<complex64>({{{{{0.0, 0.0}}}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +// Zero-length axis. +TEST_F(HloEvaluatorTest, 3D_FFT_1x1x1_on_c64x0x1x0x1) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[0, 1, 0, 1] parameter(0) + ROOT fft = c64[0, 1, 0, 1] fft(operand), fft_type=FFT, fft_length={1, 1, 1} +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + auto input, + LiteralUtil::CreateR4<complex64>({{{{}}}}).Reshape({0, 1, 0, 1})); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_)); +} + +// Some/all dimensions have length 1. +TEST_F(HloEvaluatorTest, 3D_FFT_1x1x1_on_c64x1x1x1x1) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[1, 1, 1, 1] parameter(0) + ROOT fft = c64[1, 1, 1, 1] fft(operand), fft_type=FFT, fft_length={1, 1, 1} +} +)"; + auto input = LiteralUtil::CreateR4<complex64>({{{{{42.24, 24.42}}}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_)); +} + +// Some/all dimensions have length 1. +TEST_F(HloEvaluatorTest, 3D_FFT_3x1x1_on_c64x1x3x1x1) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[1, 3, 1, 1] parameter(0) + ROOT fft = c64[1, 3, 1, 1] fft(operand), fft_type=FFT, fft_length={3, 1, 1} +} +)"; + auto input = LiteralUtil::CreateR4<complex64>( + {{{{{42.24, 24.42}}}, {{{-42.24, 24.42}}}, {{{42.24, -24.42}}}}}); + auto expected = + LiteralUtil::CreateR4<complex64>({{{{{42.24, 24.42}}}, + {{{84.5367, 97.5818}}}, + {{{-0.0566792, -48.7418}}}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +// Some/all dimensions have length 1. 
+TEST_F(HloEvaluatorTest, 3D_IFFT_3x1x1_on_c64x1x3x1x1) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[1, 3, 1, 1] parameter(0) + ROOT ifft = c64[1, 3, 1, 1] fft(operand), fft_type=IFFT, fft_length={3, 1, 1} +} +)"; + auto input = LiteralUtil::CreateR4<complex64>({{{{{42.24, 24.42}}}, + {{{84.5367, 97.5818}}}, + {{{-0.0566792, -48.7418}}}}}); + auto expected = LiteralUtil::CreateR4<complex64>( + {{{{{42.24, 24.42}}}, {{{-42.24, 24.42}}}, {{{42.24, -24.42}}}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +// Odd transform length. +TEST_F(HloEvaluatorTest, 1D_FFT_5_on_c64x5) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[5] parameter(0) + ROOT fft = c64[5] fft(operand), fft_type=FFT, fft_length={5} +} +)"; + auto input = LiteralUtil::CreateR1<complex64>( + {{1.0, 5.0}, {2.0, 4.0}, {3.0, 3.0}, {4.0, 2.0}, {5.0, 1.0}}); + auto expected = LiteralUtil::CreateR1<complex64>({{15.0, 15.0}, + {0.940955, 5.94095}, + {-1.6877, 3.3123}, + {-3.3123, 1.6877}, + {-5.94095, -0.940955}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +// Odd transform length. +TEST_F(HloEvaluatorTest, 1D_IFFT_5_on_c64x5) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[5] parameter(0) + ROOT ifft = c64[5] fft(operand), fft_type=IFFT, fft_length={5} +} +)"; + auto input = LiteralUtil::CreateR1<complex64>({{15.0, 15.0}, + {0.940955, 5.94095}, + {-1.6877, 3.3123}, + {-3.3123, 1.6877}, + {-5.94095, -0.940955}}); + auto expected = LiteralUtil::CreateR1<complex64>( + {{1.0, 5.0}, {2.0, 4.0}, {3.0, 3.0}, {4.0, 2.0}, {5.0, 1.0}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +// All input values are zero. +TEST_F(HloEvaluatorTest, 1D_FFT_4_on_zero_c64x4) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[4] parameter(0) + ROOT fft = c64[4] fft(operand), fft_type=FFT, fft_length={4} +} +)"; + auto input = LiteralUtil::CreateR1<complex64>( + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_)); +} + +// All input values are zero. 
+TEST_F(HloEvaluatorTest, 3D_FFT_3x3x4_on_zero_c64x3x3x4) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[3, 3, 4] parameter(0) + ROOT fft = c64[3, 3, 4] fft(operand), fft_type=FFT, fft_length={3, 3, 4} +} +)"; + auto input = LiteralUtil::CreateR3<complex64>( + {{{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}, + {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}, + {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_)); +} + +// All input values are zero. +TEST_F(HloEvaluatorTest, 3D_IFFT_3x3x4_on_zero_c64x3x3x4) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[3, 3, 4] parameter(0) + ROOT ifft = c64[3, 3, 4] fft(operand), fft_type=IFFT, fft_length={3, 3, 4} +} +)"; + auto input = LiteralUtil::CreateR3<complex64>( + {{{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}, + {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}, + {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), input.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(input, result, fft_error_)); +} + +// All input values are zero. +TEST_F(HloEvaluatorTest, 3D_RFFT_3x3x4_on_zero_f32x3x3x4) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = f32[3, 3, 4] parameter(0) + ROOT rfft = c64[3, 3, 3] fft(operand), fft_type=RFFT, fft_length={3, 3, 4} +} +)"; + auto input = LiteralUtil::CreateR3<float>( + {{{0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}}, + {{0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}}, + {{0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}}}); + auto expected = LiteralUtil::CreateR3<complex64>( + {{{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}, + {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}, + {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +// All input values are zero. 
+TEST_F(HloEvaluatorTest, 3D_IRFFT_3x3x4_on_zero_c64x3x3x3) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[3, 3, 3] parameter(0) + ROOT irfft = f32[3, 3, 4] fft(operand), fft_type=IRFFT, fft_length={3, 3, 4} +} +)"; + auto input = LiteralUtil::CreateR3<complex64>( + {{{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}, + {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}, + {{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}, + {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}}}); + auto expected = LiteralUtil::CreateR3<float>( + {{{0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}}, + {{0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}}, + {{0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + +// Input values, for which IRFFT discards non-zero imaginary parts. +TEST_F(HloEvaluatorTest, 2D_IRFFT_3x4_on_c64x3x3) { + const char* hlo_text = R"( +HloModule Fft + +ENTRY main { + operand = c64[3, 3] parameter(0) + ROOT irfft = f32[3, 4] fft(operand), fft_type=IRFFT, fft_length={3, 4} +} +)"; + auto input = + LiteralUtil::CreateR2<complex64>({{{0.0, 0.0}, {1.0, 0.0}, {2.0, 0.0}}, + {{3.0, 0.0}, {4.0, 0.0}, {5.0, 0.0}}, + {{6.0, 0.0}, {7.0, 0.0}, {8.0, 0.0}}}); + auto expected = + LiteralUtil::CreateR2<float>({{4.0, -0.5, 0.0, -0.5}, + {-1.5, 0.433013, 0.0, -0.433013}, + {-1.5, -0.433013, 0.0, 0.433013}}); + TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate({&input})); + EXPECT_TRUE(ShapeUtil::Compatible(result.shape(), expected.shape())); + EXPECT_TRUE(LiteralTestUtil::Near(expected, result, fft_error_)); +} + class HloEvaluatorPreciseReduceTest : public HloTestBase {}; // Tests that Reduce doesn't lose precision when adding many numbers (because diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h index ab27ac82722..a2afb0c59eb 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -68,8 +68,8 @@ T ToArithmeticSafeType(T t) { // Templated DfsHloVisitor for use by HloEvaluator. // // Typically ReturnT here indicates the resulting literal type of each evaluated -// Handle* method of a TypedVisitor. There are however a few notable exceptions -// to this rule, notably: +// Handle* method of a TypedVisitor. There are however a few exceptions to this +// rule, notably: // - HandleCompare and HandleIsFinite: where the resulting literal type is // always boolean. // - HandleImag and HandleReal: where the resulting literal type is always float @@ -81,7 +81,7 @@ T ToArithmeticSafeType(T t) { // - ReturnT: The type of input and output of each operation. // - ElementwiseT: The type in which internal computation are done. // -// This a logically a private part of HloEvaluator. It lives in this header +// This is logically a private part of HloEvaluator. 
It lives in this header // file rather than in hlo_evaluator.cc because we use extern templates and a // bunch of independent cc files to speed up compiling the many instantiations // of this class. @@ -180,7 +180,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { parent_->GetEvaluatedLiteralFor(abs->operand(0)); TF_ASSIGN_OR_RETURN( parent_->evaluated_[abs], - (HloEvaluator::ElementWiseUnaryOpImpl<float, NativeT>( + (HloEvaluator::ElementWiseUnaryOpImpl<typename NativeT::value_type, + NativeT>( abs, [](NativeT elem_operand) { return std::abs(elem_operand); }, operand_literal))); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 62bac85b1ca..3a1ba773645 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -1040,6 +1040,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kCollectivePermute: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: + case HloOpcode::kPartitionId: case HloOpcode::kRecv: case HloOpcode::kRecvDone: case HloOpcode::kSend: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 2aaaf35a824..708cbf6ea29 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -416,6 +416,10 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto( instruction = CreateReplicaId(); break; } + case HloOpcode::kPartitionId: { + instruction = CreatePartitionId(); + break; + } case HloOpcode::kConvolution: { TF_RET_CHECK(proto.has_window()); TF_RET_CHECK(proto.has_convolution_dimension_numbers()); @@ -869,6 +873,12 @@ HloInstruction::CreateCollectivePermute( new HloInstruction(HloOpcode::kReplicaId, ShapeUtil::MakeShape(U32, {}))); } +/* static */ std::unique_ptr<HloInstruction> +HloInstruction::CreatePartitionId() { + return absl::WrapUnique(new HloInstruction(HloOpcode::kPartitionId, + ShapeUtil::MakeShape(U32, {}))); +} + /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateInfeed( const Shape& infeed_shape, HloInstruction* token_operand, const string& config) { @@ -1506,6 +1516,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands( CHECK_EQ(new_operands.size(), 0); clone = CreateReplicaId(); break; + case HloOpcode::kPartitionId: + CHECK_EQ(new_operands.size(), 0); + clone = CreatePartitionId(); + break; } // SetupDerivedInstruction will setup the precision_config_ field. 
SetupDerivedInstruction(clone.get()); @@ -1765,6 +1779,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kMinimum: case HloOpcode::kMultiply: case HloOpcode::kNegate: + case HloOpcode::kPartitionId: case HloOpcode::kPopulationCount: case HloOpcode::kPower: case HloOpcode::kReal: @@ -2594,6 +2609,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) { return visitor->HandleCollectivePermute(this); case HloOpcode::kReplicaId: return visitor->HandleReplicaId(this); + case HloOpcode::kPartitionId: + return visitor->HandlePartitionId(this); case HloOpcode::kTuple: return visitor->HandleTuple(this); case HloOpcode::kMap: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 8757da52442..23b556661fc 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -529,6 +529,9 @@ class HloInstruction { // Creates an instruction that returns a U32 replica ID. static std::unique_ptr<HloInstruction> CreateReplicaId(); + // Creates an instruction that returns a U32 partition ID. + static std::unique_ptr<HloInstruction> CreatePartitionId(); + // Creates a conversion instruction, where operand is the data to convert and // shape is the target shape for the conversion. static std::unique_ptr<HloInstruction> CreateConvert(const Shape& shape, diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index 0adfef7d9e7..cf0f4bc912c 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -220,6 +220,7 @@ HLO_MATCHER(Multiply); HLO_MATCHER(Negate); HLO_MATCHER(Outfeed); HLO_MATCHER(Pad); +HLO_MATCHER(PartitionId); HLO_MATCHER(Power); HLO_MATCHER(Recv); HLO_MATCHER(RecvDone); @@ -227,6 +228,7 @@ HLO_MATCHER(Reduce); HLO_MATCHER(ReducePrecision); HLO_MATCHER(ReduceWindow); HLO_MATCHER(Remainder); +HLO_MATCHER(ReplicaId); HLO_MATCHER(Reshape); HLO_MATCHER(Reverse); HLO_MATCHER(Rng); diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 6d3a49898c2..ecd4eb3cbc0 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -108,6 +108,7 @@ namespace xla { V(kOutfeed, "outfeed", 2) \ V(kPad, "pad", 2) \ V(kParameter, "parameter", 0) \ + V(kPartitionId, "partition-id", 0) \ V(kPopulationCount, "popcnt", 1) \ V(kPower, "power", 2) \ V(kReal, "real", 1) \ diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index 3122b3eae41..3667fc381a8 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -891,6 +891,15 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, instruction = builder->AddInstruction(HloInstruction::CreateReplicaId()); break; } + case HloOpcode::kPartitionId: { + if (!ParseOperands(&operands, /*expected_size=*/0) || + !ParseAttributes(attrs)) { + return false; + } + instruction = + builder->AddInstruction(HloInstruction::CreatePartitionId()); + break; + } case HloOpcode::kReshape: { if (!ParseOperands(&operands, /*expected_size=*/1) || !ParseAttributes(attrs)) { @@ -4144,6 +4153,14 @@ bool HloParser::ParseSingleInstruction(HloModule* module) { } } + if (lexer_.GetKind() != TokKind::kEof) { + Error( + lexer_.GetLoc(), + "Syntax error:\nExpected eof after parsing single instruction. 
Did " + "you mean to write an HLO module and forget the \"HloModule\" header?"); + return false; + } + module->AddEntryComputation(builder.Build()); for (auto& comp : computations_) { module->AddEmbeddedComputation(std::move(comp)); diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index 745715f127a..011850eb9ad 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -1436,6 +1436,17 @@ ENTRY Replica-id { ROOT replica-id = u32[] replica-id() } +)" +}, +// partition-id +{ +"PartitionId", +R"(HloModule partition-id + +ENTRY PartitionId { + ROOT id = u32[] partition-id() +} + )" }, // Iota @@ -2491,6 +2502,16 @@ TEST(HloParserSingleOpTest, ConvolutionTrivialFeatureGroupCount) { EXPECT_EQ(convolution->feature_group_count(), 1); } +TEST(HloParserSingleOpTest, MultipleOpsProducesError) { + const string text = R"( + param = f32[2,5,1,3] parameter(0) + transpose = f32[1,5,2,3] transpose(param), dimensions={2,1,0,3} + )"; + auto status = ParseHloString(text).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), ::testing::HasSubstr("Expected eof")); +} + TEST_F(HloParserTest, IsScheduledIsFalse) { const string text = R"( HloModule axpy_module, is_scheduled=false diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 73c1dde8803..6cbfb784cdc 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -202,6 +202,10 @@ Status ShapeVerifier::HandleAllToAll(HloInstruction* hlo) { ShapeInference::InferAllToAllTupleShape(operand_shapes)); } +Status ShapeVerifier::HandlePartitionId(HloInstruction* hlo) { + return CheckShape(hlo, ShapeUtil::MakeShape(U32, {})); +} + Status ShapeVerifier::HandleReplicaId(HloInstruction* hlo) { return CheckShape(hlo, ShapeUtil::MakeShape(U32, {})); } @@ -983,7 +987,7 @@ Status ShapeVerifier::VerifyEntryComputationLayout(const HloModule& module) { if (computation->num_parameters() != layout.parameter_count()) { return InternalError( "Number of parameters in entry computation layout (%d) must be same " - "as number of parameters of entry computation computation (%d)", + "as number of parameters of entry computation (%d)", layout.parameter_count(), computation->num_parameters()); } diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index a38ec5a05d4..45e472bbdf2 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -57,6 +57,7 @@ class ShapeVerifier : public DfsHloVisitor { Status HandleAllReduce(HloInstruction* crs) override; Status HandleAllToAll(HloInstruction* hlo) override; Status HandleCollectivePermute(HloInstruction* hlo) override; + Status HandlePartitionId(HloInstruction* hlo) override; Status HandleReplicaId(HloInstruction* hlo) override; Status HandleReducePrecision(HloInstruction* reduce_precision) override; Status HandleInfeed(HloInstruction*) override; diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 90ac234e737..f12b725be30 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -88,6 +88,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kXor: case HloOpcode::kOutfeed: case 
HloOpcode::kPad: + case HloOpcode::kPartitionId: case HloOpcode::kPopulationCount: case HloOpcode::kReal: case HloOpcode::kReducePrecision: diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD index 599489b3785..7f0c1ccc728 100644 --- a/tensorflow/compiler/xla/service/interpreter/BUILD +++ b/tensorflow/compiler/xla/service/interpreter/BUILD @@ -35,6 +35,7 @@ cc_library( "//tensorflow/compiler/xla/service:cholesky_expander", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:computation_placer", + "//tensorflow/compiler/xla/service:custom_call_target_registry", "//tensorflow/compiler/xla/service:dynamic_index_splitter", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:flatten_call_graph", @@ -53,7 +54,6 @@ cc_library( "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:triangular_solve_expander", "//tensorflow/compiler/xla/service:while_loop_simplifier", - "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry", "//tensorflow/core:lib", "//tensorflow/stream_executor", "@com_google_absl//absl/memory", diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc index a8f8ab4f725..80a3ebccff1 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.cc +++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include "tensorflow/compiler/xla/service/cholesky_expander.h" #include "tensorflow/compiler/xla/service/computation_placer.h" -#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h" +#include "tensorflow/compiler/xla/service/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" #include "tensorflow/compiler/xla/service/hlo_constant_folding.h" @@ -52,8 +52,8 @@ namespace { StatusOr<Literal> HandleEvaluatorCustomCall( HloInstruction* custom_call, absl::Span<const Literal*> operands) { // Find the target C function in the global registry. 
- auto* registry = xla::cpu::CustomCallTargetRegistry::Global(); - void* target_fn = registry->Lookup(custom_call->custom_call_target()); + auto* registry = CustomCallTargetRegistry::Global(); + void* target_fn = registry->Lookup(custom_call->custom_call_target(), "Host"); if (!target_fn) { return NotFound("Custom call target '%s' was not registered", custom_call->custom_call_target()); @@ -96,7 +96,7 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) { StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses( std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* /*stream_exec*/, - DeviceMemoryAllocator* /*device_allocator*/) { + se::DeviceMemoryAllocator* /*device_allocator*/) { VLOG(1) << "Run hlo passes on graph " << hlo_module->name(); TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get())); return std::move(hlo_module); @@ -105,13 +105,13 @@ StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses( Status InterpreterCompiler::RunHloPassesOnModuleGroup( HloModuleGroup* module_group, absl::Span<se::StreamExecutor* const> executors, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { return Unimplemented("Module group compilation not supported on Interpreter"); } StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend( std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* /*device_allocator*/) { + se::DeviceMemoryAllocator* /*device_allocator*/) { TF_RET_CHECK(stream_exec != nullptr); VLOG(1) << "Run backend " << hlo_module->name(); @@ -137,7 +137,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> InterpreterCompiler::RunBackendOnModuleGroup( std::unique_ptr<HloModuleGroup> module_group, std::vector<std::vector<se::StreamExecutor*>> stream_exec, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { return Unimplemented( "Module group compilation is not supported on Interpreter."); } @@ -145,7 +145,7 @@ InterpreterCompiler::RunBackendOnModuleGroup( StatusOr<std::vector<std::unique_ptr<Executable>>> InterpreterCompiler::Compile( std::unique_ptr<HloModuleGroup> module_group, std::vector<std::vector<se::StreamExecutor*>> stream_exec, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { if (module_group->empty()) { return std::vector<std::unique_ptr<Executable>>(); } diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h index 591272951a0..dc83295b527 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.h +++ b/tensorflow/compiler/xla/service/interpreter/compiler.h @@ -45,24 +45,24 @@ class InterpreterCompiler : public Compiler { StatusOr<std::unique_ptr<HloModule>> RunHloPasses( std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; Status RunHloPassesOnModuleGroup( HloModuleGroup* module_group, absl::Span<se::StreamExecutor* const> executors, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr<std::unique_ptr<Executable>> RunBackend( std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr<std::vector<std::unique_ptr<Executable>>> 
RunBackendOnModuleGroup( std::unique_ptr<HloModuleGroup> module_group, std::vector<std::vector<se::StreamExecutor*>> stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr<std::vector<std::unique_ptr<Executable>>> Compile( std::unique_ptr<HloModuleGroup> module_group, std::vector<std::vector<se::StreamExecutor*>> stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>> CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group, diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 8c665823353..b1303f17580 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -2105,6 +2105,7 @@ bool LayoutAssignment::InstructionCanChangeLayout( case HloOpcode::kIota: case HloOpcode::kOutfeed: case HloOpcode::kParameter: + case HloOpcode::kPartitionId: case HloOpcode::kRecv: case HloOpcode::kRecvDone: case HloOpcode::kReduce: diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc index 382b5751202..82e955c818e 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.cc +++ b/tensorflow/compiler/xla/service/llvm_compiler.cc @@ -24,7 +24,7 @@ namespace xla { Status LLVMCompiler::RunHloPassesOnModuleGroup( HloModuleGroup* module_group, absl::Span<se::StreamExecutor* const> executors, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { return Unimplemented( "Model partitioning not implemented for the CPU/GPU compilers!"); } @@ -33,7 +33,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::RunBackendOnModuleGroup( std::unique_ptr<HloModuleGroup> module_group, std::vector<std::vector<se::StreamExecutor*>> stream_exec, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { return Unimplemented( "Model partitioning not implemented for the CPU/GPU compilers!"); } @@ -41,7 +41,7 @@ LLVMCompiler::RunBackendOnModuleGroup( StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile( std::unique_ptr<HloModuleGroup> module_group, std::vector<std::vector<se::StreamExecutor*>> stream_execs, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { // Tensorflow tries to enable the following behaviors in all its threads: // // - Denormals are zero (DAZ): roughly, operations treat denormal floats as diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h index afd9f370383..888815bea3d 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.h +++ b/tensorflow/compiler/xla/service/llvm_compiler.h @@ -61,28 +61,28 @@ class LLVMCompiler : public Compiler { // StatusOr<std::unique_ptr<Executable>> RunBackend( // std::unique_ptr<HloModule> module, // se::StreamExecutor* stream_exec, - // DeviceMemoryAllocator* device_allocator) + // se::DeviceMemoryAllocator* device_allocator) // StatusOr<std::unique_ptr<HloModule>> RunHloPasses( // std::unique_ptr<HloModule> module, // se::StreamExecutor* stream_exec, - // DeviceMemoryAllocator* device_allocator) + // se::DeviceMemoryAllocator* device_allocator) using Compiler::RunBackend; using Compiler::RunHloPasses; Status RunHloPassesOnModuleGroup( HloModuleGroup* module_group, 
absl::Span<se::StreamExecutor* const> executors, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr<std::vector<std::unique_ptr<Executable>>> RunBackendOnModuleGroup( std::unique_ptr<HloModuleGroup> module_group, std::vector<std::vector<se::StreamExecutor*>> stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr<std::vector<std::unique_ptr<Executable>>> Compile( std::unique_ptr<HloModuleGroup> module_group, std::vector<std::vector<se::StreamExecutor*>> stream_execs, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; protected: ModuleHook user_pre_optimization_hook_; diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index ca85dd7647e..e1303f60779 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -49,8 +49,8 @@ tf_cc_test( srcs = ["alias_analysis_test.cc"], deps = [ ":alias_analysis", + "//tensorflow/compiler/xla/service:custom_call_target_registry", "//tensorflow/compiler/xla/service:hlo_parser", - "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", "//tensorflow/compiler/xla/tests:filecheck", "//tensorflow/core:test", diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc index db900856993..db60e08472d 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc @@ -13,12 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h" + #include <memory> #include <utility> -#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" -#include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h" +#include "tensorflow/compiler/xla/service/custom_call_target_registry.h" #include "tensorflow/compiler/xla/tests/filecheck.h" #include "tensorflow/core/platform/test.h" @@ -29,7 +30,7 @@ class AliasAnalysisTest : public CpuCodegenTest {}; void FakeCustomCallTarget(float* out, float** in) {} -REGISTER_CUSTOM_CALL_TARGET(FakeCustomCallTarget); +XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(FakeCustomCallTarget); TEST_F(AliasAnalysisTest, EmbeddedComputationParamsMayAliasTemps) { const char* hlo_string = R"( diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h index f56ba32b04b..170d226e336 100644 --- a/tensorflow/compiler/xla/service/local_service.h +++ b/tensorflow/compiler/xla/service/local_service.h @@ -23,13 +23,13 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/compiler.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/service.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc index 8269842426e..1642c50d225 100644 --- a/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc +++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc @@ -17,25 +17,29 @@ limitations under the License. #include "absl/types/variant.h" namespace xla { -se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() { +tensorflow::se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() { if (HasOwnership()) { - return absl::get<OwningDeviceMemory>(mem_).AsDeviceMemoryBase(); + return absl::get<tensorflow::se::OwningDeviceMemory>(mem_) + .AsDeviceMemoryBase(); } else { - return absl::get<se::DeviceMemoryBase>(mem_); + return absl::get<tensorflow::se::DeviceMemoryBase>(mem_); } } bool MaybeOwningDeviceMemory::HasOwnership() const { - return absl::holds_alternative<OwningDeviceMemory>(mem_); + return absl::holds_alternative<tensorflow::se::OwningDeviceMemory>(mem_); } -absl::optional<OwningDeviceMemory> MaybeOwningDeviceMemory::Release() { +absl::optional<tensorflow::se::OwningDeviceMemory> +MaybeOwningDeviceMemory::Release() { if (!HasOwnership()) { return {}; } - OwningDeviceMemory result = std::move(absl::get<OwningDeviceMemory>(mem_)); + tensorflow::se::OwningDeviceMemory result = + std::move(absl::get<tensorflow::se::OwningDeviceMemory>(mem_)); mem_ = result.AsDeviceMemoryBase(); - return absl::make_optional<OwningDeviceMemory>(std::move(result)); + return absl::make_optional<tensorflow::se::OwningDeviceMemory>( + std::move(result)); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.h b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h index 82e7f1183c0..e4c3196640e 100644 --- a/tensorflow/compiler/xla/service/maybe_owning_device_memory.h +++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h @@ -18,30 +18,30 @@ limitations under the License. #include "absl/types/optional.h" #include "absl/types/variant.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/owning_device_memory.h" namespace xla { // MaybeOwningDeviceMemory represents either an owned or unowned device memory. -// Like std::variant<OwningDeviceMemory, DeviceMemory>. When the object goes +// Like std::variant<se::OwningDeviceMemory, DeviceMemory>. When the object goes // output of scope, it will free the underlying memory if it owns it. 
class MaybeOwningDeviceMemory { public: MaybeOwningDeviceMemory() = default; - explicit MaybeOwningDeviceMemory(OwningDeviceMemory owned) + explicit MaybeOwningDeviceMemory(tensorflow::se::OwningDeviceMemory owned) : mem_(std::move(owned)) {} - explicit MaybeOwningDeviceMemory(se::DeviceMemoryBase unowned) + explicit MaybeOwningDeviceMemory(tensorflow::se::DeviceMemoryBase unowned) : mem_(unowned) {} MaybeOwningDeviceMemory(MaybeOwningDeviceMemory&&) = default; ~MaybeOwningDeviceMemory() = default; - MaybeOwningDeviceMemory& operator=(se::DeviceMemoryBase unowned) { + MaybeOwningDeviceMemory& operator=(tensorflow::se::DeviceMemoryBase unowned) { mem_ = unowned; return *this; } - MaybeOwningDeviceMemory& operator=(OwningDeviceMemory owned) { + MaybeOwningDeviceMemory& operator=(tensorflow::se::OwningDeviceMemory owned) { mem_ = std::move(owned); return *this; } @@ -50,19 +50,21 @@ class MaybeOwningDeviceMemory { // Fetches the underlying DeviceMemoryBase from a MaybeOwningDeviceMemory. The // caller of this function is *not* responsible for freeing the memory. - se::DeviceMemoryBase AsDeviceMemoryBase(); + tensorflow::se::DeviceMemoryBase AsDeviceMemoryBase(); - // Release the OwningDeviceMemory without freeing it, and moves the ownership - // of the memory buffer from the object to the caller. + // Release the tensorflow::se::OwningDeviceMemory without freeing it, and + // moves the ownership of the memory buffer from the object to the caller. // // A nullopt is returned if the HasOwnership() == false; - absl::optional<OwningDeviceMemory> Release(); + absl::optional<tensorflow::se::OwningDeviceMemory> Release(); // Returns true if the device_memory has ownership over underlying memory. bool HasOwnership() const; private: - absl::variant<OwningDeviceMemory, se::DeviceMemoryBase> mem_; + absl::variant<tensorflow::se::OwningDeviceMemory, + tensorflow::se::DeviceMemoryBase> + mem_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 49c346d87fc..42b9e566d71 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -29,7 +29,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/computation_placer.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/dump.h" #include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h" #include "tensorflow/compiler/xla/service/executable.h" @@ -58,6 +57,7 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/ptr_util.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace { @@ -347,7 +347,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables( const std::vector<const HloModuleProto*>& module_protos, std::vector<std::unique_ptr<HloModuleConfig>> module_configs, Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { VLOG(1) << StrFormat("BuildExecutable on service %p", this); // Dump computation proto state if flag is set. 
@@ -783,7 +783,7 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg, StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable( const HloModuleProto& module_proto, std::unique_ptr<HloModuleConfig> module_config, Backend* backend, - se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) { + se::StreamExecutor* executor, se::DeviceMemoryAllocator* device_allocator) { VLOG(1) << StrFormat( "BuildExecutable on service %p with serialized module proto: %s", this, module_proto.name()); diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index f127e340b59..ba51e457c20 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -29,7 +29,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/channel_tracker.h" #include "tensorflow/compiler/xla/service/compilation_cache.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/execution_tracker.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" @@ -43,6 +42,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { @@ -234,7 +234,7 @@ class Service : public ServiceInterface { const HloModuleProto& module_proto, std::unique_ptr<HloModuleConfig> module_config, Backend* backend, se::StreamExecutor* executor, - DeviceMemoryAllocator* device_allocator = nullptr); + se::DeviceMemoryAllocator* device_allocator = nullptr); // Same as BuildExecutable() above, but builds a list of Executables for the // given computations that may interact with each other. @@ -242,7 +242,7 @@ class Service : public ServiceInterface { const std::vector<const HloModuleProto*>& module_protos, std::vector<std::unique_ptr<HloModuleConfig>> module_configs, Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors, - DeviceMemoryAllocator* device_allocator); + se::DeviceMemoryAllocator* device_allocator); // Runs the given executable with the given arguments and register the result // in the allocation tracker. The handle of the result from the tracker is diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h index 6bee6710565..7fc66310ee7 100644 --- a/tensorflow/compiler/xla/service/service_executable_run_options.h +++ b/tensorflow/compiler/xla/service/service_executable_run_options.h @@ -43,7 +43,9 @@ class ServiceExecutableRunOptions { // Delegate to `ExecutableRunOptions` member. 
se::Stream* stream() const { return run_options_.stream(); } - DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); } + se::DeviceMemoryAllocator* allocator() const { + return run_options_.allocator(); + } int device_ordinal() const { return run_options_.device_ordinal(); } // Borrows a stream and returns a smart pointer which returns the stream on diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc index 69d34583d9e..9b0ec31e9da 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer.cc @@ -119,14 +119,14 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer) { ScopedShapedBuffer::ScopedShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape, - DeviceMemoryAllocator* allocator, + se::DeviceMemoryAllocator* allocator, int device_ordinal) : ShapedBuffer(on_host_shape, on_device_shape, allocator->platform(), device_ordinal), allocator_(allocator) {} ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer, - DeviceMemoryAllocator* allocator) + se::DeviceMemoryAllocator* allocator) : ShapedBuffer(std::move(shaped_buffer)), allocator_(allocator) {} ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s) diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index 619b6ccd1cc..39346540d8d 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -21,12 +21,12 @@ limitations under the License. #include <string> #include "absl/types/span.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/shape_tree.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { @@ -138,13 +138,13 @@ class ScopedShapedBuffer : public ShapedBuffer { // Creates a ScopedShapedBuffer with null DeviceMemoryBases at each index. explicit ScopedShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape, - DeviceMemoryAllocator* allocator, + se::DeviceMemoryAllocator* allocator, int device_ordinal); // Create a ScopedShapedBuffer by taking over the memory from the incoming // ShapedBuffer. explicit ScopedShapedBuffer(ShapedBuffer shaped_buffer, - DeviceMemoryAllocator* allocator); + se::DeviceMemoryAllocator* allocator); // Movable, but not copyable. ScopedShapedBuffer(ScopedShapedBuffer&& s); @@ -157,13 +157,13 @@ class ScopedShapedBuffer : public ShapedBuffer { // Return the allocator used to allocate the device memory held in this // ScopedShapedBuffer. - DeviceMemoryAllocator* memory_allocator() const { return allocator_; } + se::DeviceMemoryAllocator* memory_allocator() const { return allocator_; } // Sets the device memory buffer at the given index. // // If the given buffer's device memory is non-null, its device_ordinal and // allocator must match those in `this`. 
- void set_buffer(OwningDeviceMemory buffer, const ShapeIndex& index) { + void set_buffer(se::OwningDeviceMemory buffer, const ShapeIndex& index) { if (!buffer.is_null()) { CHECK_EQ(buffer.device_ordinal(), device_ordinal()); CHECK_EQ(buffer.allocator(), allocator_); @@ -187,7 +187,7 @@ class ScopedShapedBuffer : public ShapedBuffer { protected: void Deallocate(); - DeviceMemoryAllocator* allocator_; + se::DeviceMemoryAllocator* allocator_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc index 3f0042e4bcb..3885c5f3759 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer_test.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc @@ -16,13 +16,13 @@ limitations under the License. #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "absl/memory/memory.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/util/ptr_util.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace { @@ -34,7 +34,7 @@ TEST(ShapedBufferTest, ScopedShapeBufferAsShapedBufferB71629047) { auto* platform = platforms[0]; TF_ASSERT_OK_AND_ASSIGN(auto executors, xla::PlatformUtil::GetStreamExecutors(platform)); - xla::StreamExecutorMemoryAllocator allocator(platform, executors); + xla::se::StreamExecutorMemoryAllocator allocator(platform, executors); const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {}); const int kDeviceOrdinal = 0; auto scoped_buffer = absl::make_unique<xla::ScopedShapedBuffer>( @@ -43,11 +43,11 @@ TEST(ShapedBufferTest, ScopedShapeBufferAsShapedBufferB71629047) { buffer = nullptr; } -class TestAllocator : public DeviceMemoryAllocator { +class TestAllocator : public se::DeviceMemoryAllocator { public: TestAllocator() - : DeviceMemoryAllocator(PlatformUtil::GetDefaultPlatform().ValueOrDie()) { - } + : se::DeviceMemoryAllocator( + PlatformUtil::GetDefaultPlatform().ValueOrDie()) {} ~TestAllocator() override { if (!allocations_.empty()) { @@ -56,18 +56,18 @@ class TestAllocator : public DeviceMemoryAllocator { } // Pull in two-arg overload of Allocate. - using DeviceMemoryAllocator::Allocate; + using se::DeviceMemoryAllocator::Allocate; - StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size, - bool /*retry_on_failure*/) override { + StatusOr<se::OwningDeviceMemory> Allocate( + int device_ordinal, uint64 size, bool /*retry_on_failure*/) override { // By contract, we must return null if size == 0. 
if (size == 0) { - return OwningDeviceMemory(); + return se::OwningDeviceMemory(); } void* buf = malloc(size); allocations_.insert({device_ordinal, buf}); - return OwningDeviceMemory(se::DeviceMemoryBase(buf, size), device_ordinal, - this); + return se::OwningDeviceMemory(se::DeviceMemoryBase(buf, size), + device_ordinal, this); } Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override { @@ -120,7 +120,7 @@ TEST(ScopedShapedBufferTest, TestTakeSubTree) { sb.buffers().ForEachMutableElement( [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { TF_ASSERT_OK_AND_ASSIGN( - OwningDeviceMemory m, + se::OwningDeviceMemory m, allocator.Allocate(/*device_ordinal=*/0, /*size=*/77)); *buffer = m.Forget(); }); @@ -158,7 +158,7 @@ TEST(ScopedShapedBufferTest, TestSubShapeTree) { sb.buffers().ForEachMutableElement( [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { TF_ASSERT_OK_AND_ASSIGN( - OwningDeviceMemory m, + se::OwningDeviceMemory m, allocator.Allocate(/*device_ordinal=*/0, /*size=*/32)); *buffer = m.Forget(); }); diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index b93ce99ca18..6474edf2701 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -308,7 +308,7 @@ Status TransferManager::TransferBufferToDevice( } StatusOr<ScopedShapedBuffer> TransferManager::AllocateScopedShapedBuffer( - const Shape& on_host_shape, DeviceMemoryAllocator* allocator, + const Shape& on_host_shape, se::DeviceMemoryAllocator* allocator, int device_ordinal) { if (!LayoutUtil::HasLayout(on_host_shape)) { return InvalidArgument("Shape must have a layout: %s", diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index 17a0a3c17f4..f08862bff26 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -229,7 +229,7 @@ class TransferManager { // shape. The on-device shape may be different as indicated by // HostShapeToDeviceShape. 
StatusOr<ScopedShapedBuffer> AllocateScopedShapedBuffer( - const Shape& on_host_shape, DeviceMemoryAllocator* allocator, + const Shape& on_host_shape, se::DeviceMemoryAllocator* allocator, int device_ordinal); // The given ShapedBuffer holds a handle to allocated memory, but it is not diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index eee18c9ed51..2eb9d278bd9 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -1331,8 +1331,7 @@ ShapeUtil::ReshapeLeavesDimensionsUnmodified( } } Shape output_shape_with_layout = output_shape; - *output_shape_with_layout.mutable_layout()->mutable_minor_to_major() = - layout; + *output_shape_with_layout.mutable_layout() = Layout{layout}; return output_shape_with_layout; } diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index c60ae52fef2..cff87c59938 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -55,6 +55,7 @@ cc_library( deps = [ "//tensorflow/compiler/xla:types", "//tensorflow/core:test", + "@com_google_absl//absl/strings", ], ) @@ -259,7 +260,6 @@ cc_library( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:computation_placer", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/service:shaped_buffer", @@ -268,6 +268,7 @@ cc_library( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "//third_party/eigen3", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:span", @@ -1172,7 +1173,6 @@ xla_test( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:computation_placer", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/service:shaped_buffer", @@ -1183,6 +1183,7 @@ xla_test( "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/stream_executor:device_memory_allocator", ], ) @@ -1418,8 +1419,8 @@ xla_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/service:custom_call_target_registry", "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -2078,7 +2079,6 @@ xla_test( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/service:shaped_buffer", @@ -2090,6 +2090,7 @@ xla_test( "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/stream_executor:device_memory_allocator", ], ) @@ -2206,13 +2207,13 @@ 
xla_test( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:generic_transfer_manager", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:stream_pool", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/stream_executor:device_memory_allocator", ], ) diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl index 08cb39d129d..48719c6c47c 100644 --- a/tensorflow/compiler/xla/tests/build_defs.bzl +++ b/tensorflow/compiler/xla/tests/build_defs.bzl @@ -265,6 +265,8 @@ def generate_backend_test_macros(backends = []): "-DXLA_DISABLED_MANIFEST=\\\"%s\\\"" % manifest, ], deps = [ + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", "//tensorflow/compiler/xla:types", "//tensorflow/core:lib", "//tensorflow/core:regexp_internal", diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc index 4687ed61a7d..63c3b4b5b02 100644 --- a/tensorflow/compiler/xla/tests/custom_call_test.cc +++ b/tensorflow/compiler/xla/tests/custom_call_test.cc @@ -19,7 +19,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h" +#include "tensorflow/compiler/xla/service/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -64,10 +64,10 @@ void F32TupleSwap(float** out, float** in) { } // namespace -REGISTER_CUSTOM_CALL_TARGET(R0F32Add2); -REGISTER_CUSTOM_CALL_TARGET(R2F32ReduceSum); -REGISTER_CUSTOM_CALL_TARGET(Add1ToValues); -REGISTER_CUSTOM_CALL_TARGET(F32TupleSwap); +XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(R0F32Add2); +XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(R2F32ReduceSum); +XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(Add1ToValues); +XLA_CPU_REGISTER_CUSTOM_CALL_TARGET(F32TupleSwap); namespace xla { namespace { diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc index 587db49957b..59c3d4f5c7e 100644 --- a/tensorflow/compiler/xla/tests/dot_operation_test.cc +++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc @@ -1521,7 +1521,7 @@ void DOT_ReorderContracting(int num_iters) { se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); - StreamExecutorMemoryAllocator allocator(platform, executors); + se::StreamExecutorMemoryAllocator allocator(platform, executors); xla::LocalClientOptions client_options; client_options.set_platform(platform); diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc index 82e2db36143..1ea72af5f5f 100644 --- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc +++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc @@ -21,7 +21,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/reference_util.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -34,6 +33,7 @@ limitations under the License. #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace { @@ -736,7 +736,7 @@ void BM_DynamicSlice(int num_iters) { se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); - StreamExecutorMemoryAllocator allocator(platform, executors); + se::StreamExecutorMemoryAllocator allocator(platform, executors); LocalClient* client = ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie(); auto* transfer_manager = diff --git a/tensorflow/compiler/xla/tests/exhaustive_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_op_test.cc index 7f35a61ba33..7df01e04c6d 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_op_test.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_op_test.cc @@ -245,14 +245,6 @@ class ExhaustiveOpTest int64 begin, end; std::tie(begin, end) = test_range; - if (begin >= known_incorrect_begin_ && end <= known_incorrect_end_) { - LOG(INFO) << absl::StreamFormat( - "Skipping this shard, as the range under test, [%d, %d), falls " - "entirely within the known-incorrect range [%d, %d).", - begin, end, known_incorrect_begin_, known_incorrect_end_); - return; - } - LOG(INFO) << "Checking range [" << begin << ", " << end << ")"; int64 input_size = end - begin; @@ -262,8 +254,7 @@ class ExhaustiveOpTest IntegralT input_val = i + begin; // If the operation is known to be buggy on a specific input clamp that // input to 0 under the assumption that the op is at least correct on 0. - if (input_val >= known_incorrect_begin_ && - input_val < known_incorrect_end_) { + if (known_incorrect_fn_ && known_incorrect_fn_(input_val)) { input_arr[i] = T{0}; } else { input_arr[i] = absl::bit_cast<T>(input_val); @@ -347,6 +338,10 @@ class ExhaustiveOpTest // denormals. const T expected_at_pos_zero = static_cast<T>(evaluate_op(0)); const T expected_at_neg_zero = static_cast<T>(evaluate_op(-0.0)); + const T expected_at_pos_min_normal_float = + static_cast<T>(evaluate_op(std::numeric_limits<float>::min())); + const T expected_at_neg_min_normal_float = + static_cast<T>(evaluate_op(-std::numeric_limits<float>::min())); for (int64 i = 0; i < input_arr.size(); ++i) { T input = input_arr[i]; float input_f32 = static_cast<float>(input); @@ -378,13 +373,23 @@ class ExhaustiveOpTest // - evaluate_op(input) // - evaluate_op(+/-0), where the sign of 0 equal to the sign of // `input`, + // - evaluate_op(+/-min_normal_float), where the sign of + // min_normal_float matches `input`. // - if relaxed_denormal_signs_, evaluate_op(-/+0), where the sign of // 0 is the opposite of `input`. + // + // (In particular, the XLA:CPU implementation of log flushes positive + // denormals to min-normal-float. This seems kind of reasonable if our + // goal is to avoid infinities because they cause nans?) T sign_preserving_ftz_expected = std::signbit(input_f32) ? 
expected_at_neg_zero : expected_at_pos_zero; + T flush_to_normal_expected = std::signbit(input_f32) + ? expected_at_neg_min_normal_float + : expected_at_pos_min_normal_float; T sign_nonpreserving_ftz_expected = std::signbit(input_f32) ? expected_at_pos_zero : expected_at_neg_zero; if (IsClose(sign_preserving_ftz_expected, actual) || + IsClose(flush_to_normal_expected, actual) || (relaxed_denormal_signs_ && IsClose(sign_nonpreserving_ftz_expected, actual))) { continue; @@ -395,11 +400,13 @@ class ExhaustiveOpTest return absl::StrFormat( "Mismatch on denormal value %s. Expected one of:\n" " %10s (evaluated at full-precision value)\n" + " %10s (evaluated at sign-preserving min-normal-float)\n" " %10s (evaluated after flushing to sign-preserving zero)\n" " %10s (evaluated after flushing to non-sign-preserving " "zero)\n" "but got %s.", - StringifyNum(input), StringifyNum(expected), + StringifyNum(input), // + StringifyNum(expected), StringifyNum(flush_to_normal_expected), StringifyNum(sign_preserving_ftz_expected), StringifyNum(sign_nonpreserving_ftz_expected), StringifyNum(actual)); @@ -409,10 +416,13 @@ class ExhaustiveOpTest return absl::StrFormat( "Mismatch on denormal value %s. Expected one of:\n" " %10s (evaluated at full-precision value)\n" + " %10s (evaluated at sign-preserving min-normal-float)\n" " %10s (evaluated after flushing to sign-preserving zero)\n" "but got %s.", - StringifyNum(input), StringifyNum(expected), - StringifyNum(sign_preserving_ftz_expected), StringifyNum(actual)); + StringifyNum(input), // + StringifyNum(expected), StringifyNum(flush_to_normal_expected), + StringifyNum(sign_preserving_ftz_expected), // + StringifyNum(actual)); }); } } @@ -434,11 +444,14 @@ class ExhaustiveOpTest LOG(ERROR) << err_generator(); } else if (*mismatches == kMaxMismatchesLoggedToErr) { LOG(ERROR) << "Not printing any more mismatches; pass " - "--vmodule=exhaustive_f32__op_test=2 to see " + "--vmodule=exhaustive_op_test=2 to see " "all of them."; } } + // Sets error parameters appropriately for testing sin/cos/tan. + void SetParamsForSinCosTan(); + // The following members are set during construction so testcases can read // these values and use them e.g. to influence the values given to the mutable // members below. @@ -452,10 +465,9 @@ class ExhaustiveOpTest // Tests can set the following variables for control over execution. This is // safe because each XLA_TEST_P instantiates a new instance of this class. - // Testing will ignore the given range (encoded as bitwise representations of - // the type under test zero-extended to int64). - int64 known_incorrect_begin_ = 0; - int64 known_incorrect_end_ = 0; + // Testing will ignore inputs for which known_incorect_fn_ returns true. (Its + // argument is the type under test, e.g. f32, zero-extended to int64). + std::function<bool(int64)> known_incorrect_fn_; // If unset, reasonable defaults will be used depending on the type under // test. @@ -496,40 +508,39 @@ XLA_TEST_P(ExhaustiveOpTest, Log1p) { } XLA_TEST_P(ExhaustiveOpTest, Exp) { - if (platform_ == "Host" && ty_ == F32) { - // TODO(b/73142289): The vectorized Exp implementation gives results outside - // our error spec in this range. - known_incorrect_begin_ = 1107296256 + 11583654; - known_incorrect_end_ = 1107296256 + 11629080; - } else if (platform_ == "Host" && ty_ == BF16) { - // TODO(jlebar): Is this a rounding error? Why doesn't it occur on XLA:GPU? - // - // Mismatch on 88.5 (0x42b1). - // Expected 2.72491739e+38 (0x7f4d), but got inf (0x7f80). 
- known_incorrect_begin_ = 0x42b1; - known_incorrect_end_ = 0x42b2; + // Our CPU implementation of exp returns one incorrect value: says + // exp(88.7228394) = max-float, but the correct answer is inf. We deem this + // acceptable and check for it explicitly so that we can be aware if anything + // changes. + if (platform_ == "Host") { + auto host_exp_with_overflow = +[](float f) { + if (f == 88.7228394f) { + return 3.40282347e+38f; + } + return std::exp(f); + }; + Run(Exp, host_exp_with_overflow); + } else { + Run(Exp, std::exp); } - - Run(Exp, std::exp); } XLA_TEST_P(ExhaustiveOpTest, Expm1) { - // Expm1 has the same erroneous behavior on CPU as Exp. - if (platform_ == "Host" && ty_ == F32) { - // TODO(b/73142289): The vectorized Exp implementation gives results outside - // our error spec in this range. - known_incorrect_begin_ = 1107296256 + 11583654; - known_incorrect_end_ = 1107296256 + 11629080; - } else if (platform_ == "Host" && ty_ == BF16) { - // TODO(jlebar): Is this a rounding error? Why doesn't it occur on XLA:GPU? - // - // Mismatch on 88.5 (0x42b1). - // Expected 2.72491739e+38 (0x7f4d), but got inf (0x7f80). - known_incorrect_begin_ = 0x42b1; - known_incorrect_end_ = 0x42b2; + // Our CPU implementation of expm1 returns one incorrect value: says + // exp(88.7228394) = max-float, but the correct answer is inf. We deem this + // acceptable and check for it explicitly so that we can be aware if anything + // changes. + if (platform_ == "Host") { + auto host_expm1_with_overflow = +[](float f) { + if (f == 88.7228394f) { + return 3.40282347e+38f; + } + return std::expm1(f); + }; + Run(Expm1, host_expm1_with_overflow); + } else { + Run(Expm1, std::expm1); } - - Run(Expm1, std::expm1); } // It feels a little overkill to exhaustively test sqrt and pow(x, 0.5), but @@ -572,17 +583,90 @@ XLA_TEST_P(ExhaustiveOpTest, Asinh) { Run(Asinh, std::asinh); } XLA_TEST_P(ExhaustiveOpTest, Atanh) { Run(Atanh, std::atanh); } +XLA_TEST_P(ExhaustiveOpTest, Acos) { Run(Acos, std::acos); } +XLA_TEST_P(ExhaustiveOpTest, Asin) { Run(Asin, std::asin); } + +XLA_TEST_P(ExhaustiveOpTest, Cosh) { + // Our cosh implementation incorrectly overflows to inf for +/-89.4159851. + // The correct answer of 3.40281961e+38 (0x7f7fffec) is very close to + // max-float, so we deem this acceptable. + // + // This does not occur on CPU because we have an offsetting error in our + // implementation of exp. + float (*host_cosh)(float); + if (platform_ == "Host") { + host_cosh = &std::cosh; + } else { + host_cosh = +[](float x) { + if (std::abs(x) == 89.4159851f) { + return std::numeric_limits<float>::infinity(); + } + return std::cosh(x); + }; + } + Run(Cosh, host_cosh); +} +XLA_TEST_P(ExhaustiveOpTest, Sinh) { + // Our sinh implementation incorrectly overflows to +/-inf for +/-89.4159851. + // The correct answer of 3.40281961e+38 (0x7f7fffec) is very close to + // max-float, so we deem this acceptable. + // + // This does not occur on CPU because we have an offsetting error in our + // implementation of exp. 
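A brief editorial note on the magic constants in the comments above, with a standalone check that is not part of the patch: 88.7228394 is approximately ln(FLT_MAX), which is why exp and expm1 first overflow near that input, and 89.4159851 is approximately ln(FLT_MAX) + ln(2), which is where cosh and sinh, both roughly exp(x)/2 for large x, first overflow.

  #include <cfloat>
  #include <cmath>
  #include <cstdio>

  int main() {
    // exp overflows float once x exceeds ln(FLT_MAX); prints roughly 88.72284.
    std::printf("exp threshold  ~= %.7f\n",
                std::log(static_cast<double>(FLT_MAX)));
    // cosh(x) and sinh(x) behave like exp(x)/2 for large x, so their overflow
    // threshold is ln(FLT_MAX) + ln(2); prints roughly 89.41599.
    std::printf("cosh threshold ~= %.7f\n",
                std::log(static_cast<double>(FLT_MAX)) + std::log(2.0));
    // Evaluated in double, cosh just below that threshold is still finite and
    // lands a hair under FLT_MAX, matching the value quoted in the comments.
    std::printf("cosh(89.4159851) ~= %g\n", std::cosh(89.4159851));
    return 0;
  }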
+ float (*host_sinh)(float); + if (platform_ == "Host") { + host_sinh = &std::sinh; + } else { + host_sinh = +[](float x) { + if (std::abs(x) == 89.4159851f) { + return std::copysign(std::numeric_limits<float>::infinity(), x); + } + return std::sinh(x); + }; + } + Run(Sinh, host_sinh); +} +XLA_TEST_P(ExhaustiveOpTest, Tanh) { Run(Tanh, std::tanh); } + +void ExhaustiveOpTest::SetParamsForSinCosTan() { + if (platform_ == "Host" || platform_ == "CUDA") { + return; + } + + // Non CPU/GPU targets may have used the Cody-Waite range reduction technique + // and will not provide meaningful results for sin/cos/tan if magnitudes + // exceed 2**p. + if (ty_ == F32) { + rel_err_ = 0.001; + abs_err_ = 0.001; + known_incorrect_fn_ = [](int64 v) { + float f = absl::bit_cast<float>(static_cast<uint32>(v)); + return std::abs(f) > (1 << 13); + }; + } else if (ty_ == BF16) { + known_incorrect_fn_ = [](int64 v) { + float f = + static_cast<float>(absl::bit_cast<bfloat16>(static_cast<uint16>(v))); + return std::abs(f) > (1 << 13); + }; + } +} + +XLA_TEST_P(ExhaustiveOpTest, Cos) { + SetParamsForSinCosTan(); + Run(Cos, std::cos); +} +XLA_TEST_P(ExhaustiveOpTest, Sin) { + SetParamsForSinCosTan(); + Run(Sin, std::sin); +} +XLA_TEST_P(ExhaustiveOpTest, Tan) { + SetParamsForSinCosTan(); + Run(Tan, std::tan); +} // TODO(jlebar): Enable these. -// XLA_TEST_P(ExhaustiveOpTest, Acos) { Run(Acos, std::acos); } -// XLA_TEST_P(ExhaustiveOpTest, Asin) { Run(Asin, std::asin); } // XLA_TEST_P(ExhaustiveOpTest, Atan) { Run(Atan, std::atan); } -// XLA_TEST_P(ExhaustiveOpTest, Cosh) { Run(Cosh, std::cosh); } -// XLA_TEST_P(ExhaustiveOpTest, Cos) { Run(Cos, std::cos); } -// XLA_TEST_P(ExhaustiveOpTest, Sinh) { Run(Sinh, std::sinh); } -// XLA_TEST_P(ExhaustiveOpTest, Sin) { Run(Sin, std::sin); } -// XLA_TEST_P(ExhaustiveOpTest, Tanh) { Run(Tanh, std::tanh); } -// XLA_TEST_P(ExhaustiveOpTest, Tan) { Run(Tan, std::tan); } // XLA_TEST_P(ExhaustiveOpTest, Atan2) { Run(Atan2, std::atan2); } XLA_TEST_P(ExhaustiveOpTest, Erf) { Run(Erf, std::erf); } @@ -623,19 +707,24 @@ XLA_TEST_P(ExhaustiveOpTest, Lgamma) { if (platform_ == "CUDA" && (ty_ == F32 || ty_ == F16)) { rel_err_ = 0.001; } + float (*host_lgamma)(float) = std::lgamma; if (platform_ != "Host" && platform_ != "CUDA") { // TODO(b/123956399): This is a fairly high error, significantly higher than // we see on CPU/GPU. rel_err_ = 0.01; abs_err_ = 0.01; - // Overflows for to inf for input 4.08500343e+36 (0x7c44af8e). + // Overflows to inf for input 4.08500343e+36 (0x7c44af8e). 
if (ty_ == F32) { - known_incorrect_begin_ = 0x7c44af8e; - known_incorrect_end_ = 0x7c44af8e + 1; + host_lgamma = +[](float v) { + if (absl::bit_cast<uint32>(v) == 0x7c44af8e) { + return std::numeric_limits<float>::infinity(); + } + return std::lgamma(v); + }; } } - Run(Lgamma, std::lgamma); + Run(Lgamma, host_lgamma); } XLA_TEST_P(ExhaustiveOpTest, Round) { Run(Round, std::round); } diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc index f4a7309adc9..2d0805cdb0e 100644 --- a/tensorflow/compiler/xla/tests/fusion_test.cc +++ b/tensorflow/compiler/xla/tests/fusion_test.cc @@ -829,7 +829,7 @@ void BM_ParallelFusion(int num_iters) { se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); - StreamExecutorMemoryAllocator allocator(platform, executors); + se::StreamExecutorMemoryAllocator allocator(platform, executors); const int64 intra_op_parallelism_threads = 24; xla::LocalClientOptions client_options; diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc index 2d4d480cd48..67a1abacd18 100644 --- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -41,6 +40,7 @@ limitations under the License. 
#include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace { @@ -902,7 +902,7 @@ void BM_LocalClientOverhead(int num_iters) { se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); - StreamExecutorMemoryAllocator allocator(platform, executors); + se::StreamExecutorMemoryAllocator allocator(platform, executors); LocalClient* client = ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie(); auto* transfer_manager = diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc index 710d8ae40aa..7eaa2791d47 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.cc +++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc @@ -35,17 +35,16 @@ namespace xla { /* static */ TestAllocator* LocalClientTestBase::allocator_; -StatusOr<OwningDeviceMemory> TestAllocator::Allocate(int device_ordinal, - uint64 size, - bool retry_on_failure) { +StatusOr<se::OwningDeviceMemory> TestAllocator::Allocate( + int device_ordinal, uint64 size, bool retry_on_failure) { VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")"; { tensorflow::mutex_lock lock(count_mutex_); allocation_count_++; device_allocation_count_[device_ordinal]++; } - return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size, - retry_on_failure); + return se::StreamExecutorMemoryAllocator::Allocate(device_ordinal, size, + retry_on_failure); } Status TestAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) { @@ -55,7 +54,7 @@ Status TestAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) { deallocation_count_++; device_deallocation_count_[device_ordinal]++; } - return StreamExecutorMemoryAllocator::Deallocate(device_ordinal, mem); + return se::StreamExecutorMemoryAllocator::Deallocate(device_ordinal, mem); } int64 TestAllocator::allocation_count() const { diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h index 4027c7b124f..292baacf969 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.h +++ b/tensorflow/compiler/xla/tests/local_client_test_base.h @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -36,18 +35,19 @@ limitations under the License. 
#include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { -class TestAllocator : public StreamExecutorMemoryAllocator { +class TestAllocator : public se::StreamExecutorMemoryAllocator { public: explicit TestAllocator(se::Platform* platform) - : StreamExecutorMemoryAllocator( + : se::StreamExecutorMemoryAllocator( platform, PlatformUtil::GetStreamExecutors(platform).ValueOrDie()) { } - StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size, - bool retry_on_failure) override; + StatusOr<se::OwningDeviceMemory> Allocate(int device_ordinal, uint64 size, + bool retry_on_failure) override; Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override; // Return the number of allocations that have been performed. diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc index 0dcb1c42db1..4b3283b5cd7 100644 --- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc +++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc @@ -84,7 +84,7 @@ XLA_TEST_P(SelectAndScatterTest, ParamTest) { GetParam().window_strides, GetParam().padding_type, source, ConstantR0<float>(&builder_, 0.0f), add_f32_); - ComputeAndCompare(&builder_, {}, ErrorSpec(1e-5)); + ComputeAndCompare(&builder_, {}, ErrorSpec(1e-4)); } INSTANTIATE_TEST_CASE_P( @@ -199,7 +199,10 @@ INSTANTIATE_TEST_CASE_P( SelectAndScatterTestParam{ {1, 5, 5}, {1, 5, 5}, Padding::kSame, {3, 1, 1}, {3, 1, 1}}, SelectAndScatterTestParam{ - {7, 8, 256}, {4, 8, 256}, Padding::kSame, {2, 1, 1}, {2, 1, 1}})); + {7, 8, 256}, {4, 8, 256}, Padding::kSame, {2, 1, 1}, {2, 1, 1}}, + SelectAndScatterTestParam{{1104}, {551}, Padding::kValid, {3}, {2}}, + SelectAndScatterTestParam{ + {1300}, {1171}, Padding::kValid, {130}, {1}})); // Test for F32 1D array, with a zero-element input. XLA_TEST_F(SelectAndScatterTest, R1S0F32) { diff --git a/tensorflow/compiler/xla/tests/test_macros.cc b/tensorflow/compiler/xla/tests/test_macros.cc index a9874a91865..4241d813356 100644 --- a/tensorflow/compiler/xla/tests/test_macros.cc +++ b/tensorflow/compiler/xla/tests/test_macros.cc @@ -18,9 +18,8 @@ limitations under the License. #include <fstream> #include <streambuf> #include <string> -#include <unordered_map> -#include "absl/strings/ascii.h" +#include "absl/container/flat_hash_map.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_split.h" #include "tensorflow/core/platform/logging.h" @@ -31,7 +30,7 @@ namespace { // Mapping from test name; i.e. MyTest.MyTestCase to platforms on which it is // disabled - a sequence of regexps. -using ManifestT = std::unordered_map<string, std::vector<string>>; +using ManifestT = absl::flat_hash_map<string, std::vector<string>>; ManifestT ReadManifest() { ManifestT manifest; @@ -68,10 +67,21 @@ ManifestT ReadManifest() { } // namespace -string PrependDisabledIfIndicated(const string& test_case_name, - const string& test_name) { +std::string PrependDisabledIfIndicated(absl::string_view test_case_name, + absl::string_view test_name) { ManifestT manifest = ReadManifest(); + // If the test name ends with a slash followed by one or more digits, strip + // that off; this is just a shard number, and matching on this would be + // unstable even if someone wanted to do it. 
+ static auto* shard_num_pattern = new RE2(R"(/\d+$)"); + tensorflow::RegexpStringPiece suffix; + if (RE2::PartialMatch( + tensorflow::RegexpStringPiece(test_name.data(), test_name.size()), + *shard_num_pattern, &suffix)) { + test_name.remove_suffix(suffix.size()); + } + // First try full match: test_case_name.test_name // If that fails, try to find just the test_case_name; this would disable all // tests in the test case. @@ -79,7 +89,7 @@ string PrependDisabledIfIndicated(const string& test_case_name, if (it == manifest.end()) { it = manifest.find(test_case_name); if (it == manifest.end()) { - return test_name; + return std::string(test_name); } } @@ -88,12 +98,12 @@ string PrependDisabledIfIndicated(const string& test_case_name, string platform_string = XLA_PLATFORM; for (const auto& s : disabled_platforms) { if (RE2::FullMatch(/*text=*/platform_string, /*re=*/s)) { - return "DISABLED_" + test_name; + return absl::StrCat("DISABLED_", test_name); } } // We didn't hit in the disabled manifest entries, so don't disable it. - return test_name; + return std::string(test_name); } } // namespace xla diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h index 80a6868485c..9636df2ff5f 100644 --- a/tensorflow/compiler/xla/tests/test_macros.h +++ b/tensorflow/compiler/xla/tests/test_macros.h @@ -30,6 +30,7 @@ limitations under the License. #include <string> +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/test.h" @@ -68,8 +69,8 @@ namespace xla { // disabled on a particular platform. For a test that should be disabled, // returns DISABLED_ prepended to its name; otherwise returns the test name // unmodified. -string PrependDisabledIfIndicated(const string& test_case_name, - const string& test_name); +std::string PrependDisabledIfIndicated(absl::string_view test_case_name, + absl::string_view test_name); } // namespace xla diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc index c27ab5af76e..00b72cedbf5 100644 --- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc +++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc @@ -19,7 +19,6 @@ limitations under the License. #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/generic_transfer_manager.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/service/stream_pool.h" @@ -34,6 +33,7 @@ limitations under the License. 
#include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace { diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index 85212fa56d7..4d80a57ad40 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -1265,7 +1265,7 @@ void BM_WhileLoop(int num_iters) { se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); - StreamExecutorMemoryAllocator allocator(platform, executors); + se::StreamExecutorMemoryAllocator allocator(platform, executors); LocalClient* client = ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie(); diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc index 7b7b8f5d02d..b36fc4174ae 100644 --- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc +++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc @@ -135,7 +135,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, LocalService* service = ClientLibrary::GetXlaService(client->platform()); Backend* backend = service->mutable_backend(); se::StreamExecutor* executor = backend->default_stream_executor(); - DeviceMemoryAllocator* allocator = backend->memory_allocator(); + se::DeviceMemoryAllocator* allocator = backend->memory_allocator(); auto* transfer_manager = backend->transfer_manager(); TF_ASSERT_OK_AND_ASSIGN( StreamPool::Ptr stream_ptr, diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index d66561315b4..3d443beeecb 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ b/tensorflow/compiler/xla/tools/replay_computation.cc @@ -271,7 +271,7 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module, // Run the computation num_runs times, and return the result from the last // execution. 
const bool xla_hlo_profile = GetDebugOptionsFromFlags().xla_hlo_profile(); - StreamExecutorMemoryAllocator allocator( + se::StreamExecutorMemoryAllocator allocator( client->platform(), {client->platform()->ExecutorForDevice(0).ValueOrDie()}); absl::optional<ScopedShapedBuffer> final_result; diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc index f9a1259ef85..732b7f2efd7 100644 --- a/tensorflow/compiler/xla/util.cc +++ b/tensorflow/compiler/xla/util.cc @@ -111,7 +111,7 @@ std::vector<int64> InversePermutation( DCHECK(IsPermutation(input_permutation, input_permutation.size())); std::vector<int64> output_permutation(input_permutation.size(), -1); for (size_t i = 0; i < input_permutation.size(); ++i) { - output_permutation[input_permutation[i]] = i; + output_permutation.at(input_permutation.at(i)) = i; } return output_permutation; } @@ -121,7 +121,7 @@ std::vector<int64> ComposePermutations(absl::Span<const int64> p1, CHECK_EQ(p1.size(), p2.size()); std::vector<int64> output; for (size_t i = 0; i < p1.size(); ++i) { - output.push_back(p1[p2[i]]); + output.push_back(p1.at(p2.at(i))); } return output; } diff --git a/tensorflow/compiler/xla/xla.bzl b/tensorflow/compiler/xla/xla.bzl index 562ea051418..16ef0caf29b 100644 --- a/tensorflow/compiler/xla/xla.bzl +++ b/tensorflow/compiler/xla/xla.bzl @@ -8,7 +8,10 @@ load( "//tensorflow/core:platform/default/build_config_root.bzl", "if_static", ) -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") +load( + "//tensorflow/core:platform/default/cuda_build_defs.bzl", + "if_cuda_is_configured", +) # xla_proto_library() is a convenience wrapper around cc_proto_library. def xla_proto_library(name, srcs = [], deps = [], visibility = None, testonly = 0, **kwargs): diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD index 4320a4c5eae..acd984f9e99 100644 --- a/tensorflow/compiler/xrt/BUILD +++ b/tensorflow/compiler/xrt/BUILD @@ -67,13 +67,13 @@ cc_library( "//tensorflow/compiler/xla:xla_proto", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/service:backend", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/stream_executor", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc index 343f43b7159..9020fe8ea78 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc @@ -147,4 +147,9 @@ REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllAllocations").Device(DEVICE_XLA_GPU), REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllAllocations").Device(DEVICE_XLA_CPU), XRTReleaseAllAllocationsOp<XRTGenericDeviceAccessor>); +REGISTER_KERNEL_BUILDER(Name("XRTCompactAllocations").Device(DEVICE_XLA_GPU), + XRTCompactAllocationsOp<XRTGenericDeviceAccessor>); +REGISTER_KERNEL_BUILDER(Name("XRTCompactAllocations").Device(DEVICE_XLA_CPU), + XRTCompactAllocationsOp<XRTGenericDeviceAccessor>); + } // namespace tensorflow diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h index 6af73ecc853..8a54e0987e5 100644 --- 
a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
@@ -688,6 +688,27 @@ class XRTReleaseAllAllocationsOp : public OpKernel {
   }
 };
 
+template <class DeviceAccessor>
+class XRTCompactAllocationsOp : public OpKernel {
+ public:
+  explicit XRTCompactAllocationsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  ~XRTCompactAllocationsOp() override = default;
+  XRTCompactAllocationsOp(const XRTCompactAllocationsOp&) = delete;
+  XRTCompactAllocationsOp& operator=(const XRTCompactAllocationsOp&) = delete;
+
+  void Compute(OpKernelContext* ctx) override {
+    VLOG(1) << "XRTCompactAllocationsOp::Compute";
+
+    ResourceMgr* rm;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
+    class DeviceAccessor::ScopedRef device_ref;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(ctx, &device_ref));
+    OP_REQUIRES_OK(ctx,
+                   XRTTupleAllocation::CompactAllocations(
+                       rm, device_ref.backend(), device_ref.device_ordinal()));
+  }
+};
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_XRT_KERNELS_XRT_STATE_OPS_H_
diff --git a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
index 87546fce4e4..6d4e70fad53 100644
--- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
@@ -191,4 +191,14 @@ REGISTER_OP("XRTReleaseAllAllocations")
 Discards all the XRT allocations. All the client held handles will be invalid.
 )");
 
+REGISTER_OP("XRTCompactAllocations")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs)
+    .Doc(
+        R"(
+Runs a device memory compaction cycle. This copies the device data behind the
+currently alive allocation handles into host memory, releases the device memory
+backing the handles, and re-allocates and sends the data back to the device.
+This operation helps with device memory fragmentation.
+)"); + } // namespace tensorflow diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc index bc07ae34eba..305b3a67fae 100644 --- a/tensorflow/compiler/xrt/tests/raw_api_test.cc +++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc @@ -70,6 +70,14 @@ xla::LiteralProto TwoElementTuple() { return tuple.ToProto(); } +xla::LiteralProto BasedTwoElementTuple(float base) { + auto array = xla::LiteralUtil::CreateR1<float>({base, base + 1}); + auto matrix = xla::LiteralUtil::CreateR2<float>( + {{base + 2, base + 3}, {base + 4, base + 5}}); + auto tuple = xla::LiteralUtil::MakeTuple({&array, &matrix}); + return tuple.ToProto(); +} + xla::LiteralProto ScalarLiteral() { auto scalar = xla::LiteralUtil::CreateR0<float>(12.0f); return scalar.ToProto(); @@ -381,8 +389,8 @@ TEST(RawApiTest, AllocAndRewrite) { auto read_back = ops::XRTReadLiteral(root, handle); TF_ASSERT_OK(root.status()); - tensorflow::ClientSession session(root); - std::vector<tensorflow::Tensor> outputs; + ClientSession session(root); + std::vector<Tensor> outputs; TF_EXPECT_OK(session.Run({read_back, handle}, &outputs)); EXPECT_EQ(outputs.size(), 2); @@ -414,8 +422,7 @@ TEST(RawApiTest, AllocAndRewrite) { release_tensor.flat<int64>()(0) = allocation_handle; auto release = ops::XRTReleaseAllocationHandle(root, release_tensor); - TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release}, - &outputs)); + TF_EXPECT_OK(session.Run(ClientSession::FeedType(), {}, {release}, &outputs)); } TEST(RawApiTest, AllocReleaseMany) { @@ -435,8 +442,8 @@ TEST(RawApiTest, AllocReleaseMany) { auto handle2 = ops::XRTAllocate(root, value2); TF_ASSERT_OK(root.status()); - tensorflow::ClientSession session(root); - std::vector<tensorflow::Tensor> outputs; + ClientSession session(root); + std::vector<Tensor> outputs; TF_EXPECT_OK(session.Run({handle1, handle2}, &outputs)); EXPECT_EQ(outputs.size(), 2); @@ -448,8 +455,7 @@ TEST(RawApiTest, AllocReleaseMany) { release_tensor.flat<int64>()(1) = allocation_handle2; auto release = ops::XRTReleaseAllocationHandle(root, release_tensor); - TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release}, - &outputs)); + TF_EXPECT_OK(session.Run(ClientSession::FeedType(), {}, {release}, &outputs)); } TEST(RawApiTest, CompileAndReleaseMany) { @@ -498,8 +504,7 @@ TEST(RawApiTest, CompileAndReleaseMany) { release_tensor.flat<int64>()(1) = compilation_handle2; auto release = ops::XRTReleaseCompilationHandle(root, release_tensor); - TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release}, - &outputs)); + TF_EXPECT_OK(session.Run(ClientSession::FeedType(), {}, {release}, &outputs)); } TEST(RawApiTest, AllocAndClearAll) { @@ -513,8 +518,8 @@ TEST(RawApiTest, AllocAndClearAll) { auto handle = ops::XRTAllocate(root, value); TF_ASSERT_OK(root.status()); - tensorflow::ClientSession session(root); - std::vector<tensorflow::Tensor> outputs; + ClientSession session(root); + std::vector<Tensor> outputs; TF_EXPECT_OK(session.Run({handle}, &outputs)); EXPECT_EQ(outputs.size(), 1); @@ -522,13 +527,13 @@ TEST(RawApiTest, AllocAndClearAll) { auto clear_all = ops::XRTReleaseAllAllocations(root); - TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, - {clear_all}, &outputs)); + TF_EXPECT_OK( + session.Run(ClientSession::FeedType(), {}, {clear_all}, &outputs)); EXPECT_EQ(outputs.size(), 0); auto read_after_clear = ops::XRTReadLiteral(root, Input(allocation_handle)); EXPECT_EQ(session.Run({read_after_clear}, 
&outputs).code(), - tensorflow::error::Code::NOT_FOUND); + error::Code::NOT_FOUND); } TEST(RawApiTest, ReadAndWriteState) { @@ -544,10 +549,10 @@ TEST(RawApiTest, ReadAndWriteState) { root.WithControlDependencies(read_back), handle); TF_ASSERT_OK(root.status()); - tensorflow::ClientSession session(root); - std::vector<tensorflow::Tensor> outputs; - TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {read_back}, - {release}, &outputs)); + ClientSession session(root); + std::vector<Tensor> outputs; + TF_EXPECT_OK( + session.Run(ClientSession::FeedType(), {read_back}, {release}, &outputs)); xla::LiteralProto response; EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()())); @@ -1022,8 +1027,8 @@ TEST(RawApiTest, CompileWithXlaReturnShapes) { ClientSession session(root); std::vector<Tensor> outputs; - TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), - {c_handle.program_shape}, {release}, &outputs)); + TF_EXPECT_OK(session.Run(ClientSession::FeedType(), {c_handle.program_shape}, + {release}, &outputs)); xla::ProgramShapeProto program_shape_proto; EXPECT_TRUE(program_shape_proto.ParseFromString(outputs[0].vec<string>()(0))); @@ -1337,8 +1342,8 @@ TEST(RawApiTest, CompileAndExecuteWithReusedBuffers) { root.WithControlDependencies(read_back), result); TF_ASSERT_OK(root.status()); - TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {read_back}, - {release}, &outputs)); + TF_EXPECT_OK( + session.Run(ClientSession::FeedType(), {read_back}, {release}, &outputs)); xla::Literal exec_literal = ReadOutputLiteral(outputs, 0); auto exec_literal_parts = exec_literal.DecomposeTuple(); @@ -1355,7 +1360,7 @@ TEST(RawApiTest, CompileAndExecuteWithReusedBuffers) { root.WithControlDependencies(read_handle), Input(alloc_handle)); TF_ASSERT_OK(root.status()); - TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {read_handle}, + TF_EXPECT_OK(session.Run(ClientSession::FeedType(), {read_handle}, {release_handle}, &outputs)); xla::Literal return_literal = ReadOutputLiteral(outputs, 0); @@ -1424,6 +1429,65 @@ TEST(RawApiTest, CompileAndExecuteWithS64Argument) { xla::Shape(program_shape.result()), xla::S64)); } +// Tests the XRT device memory compation API (XRTCompactAllocations). +TEST(RawApiTest, TestDeviceMemoryCompaction) { + static const int kNumAllocs = 32; + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + + std::vector<xrt::XLAAllocation> allocs(kNumAllocs); + std::vector<Output> handle_outputs; + for (int i = 0; i < kNumAllocs; ++i) { + *allocs[i].mutable_value() = BasedTwoElementTuple(i * 4.0f); + auto value = ops::Const(root.WithDevice("/device:CPU:0"), + allocs[i].SerializeAsString()); + handle_outputs.push_back(ops::XRTAllocate(root, value)); + } + TF_ASSERT_OK(root.status()); + + ClientSession session(root); + std::vector<Tensor> outputs; + TF_EXPECT_OK(session.Run(handle_outputs, &outputs)); + EXPECT_EQ(outputs.size(), handle_outputs.size()); + + std::vector<int64> handles; + for (auto& output : outputs) { + handles.push_back(output.scalar<int64>()()); + } + // Create holes by releasing even allocations. + std::vector<Operation> handle_releases; + for (size_t i = 0; i < handles.size(); i += 2) { + handle_releases.push_back( + ops::XRTReleaseAllocationHandle(root, Input(handles[i]))); + } + TF_ASSERT_OK(root.status()); + + TF_EXPECT_OK( + session.Run(ClientSession::FeedType(), {}, handle_releases, &outputs)); + + // Run the compaction API. 
+ auto compact_op = ops::XRTCompactAllocations(root); + TF_EXPECT_OK( + session.Run(ClientSession::FeedType(), {}, {compact_op}, &outputs)); + + // Read back the allocation left at odd indices. + std::vector<Output> read_outputs; + for (size_t i = 1; i < handles.size(); i += 2) { + read_outputs.push_back(ops::XRTReadLiteral(root, Input(handles[i]))); + } + TF_ASSERT_OK(root.status()); + + TF_EXPECT_OK(session.Run(read_outputs, &outputs)); + EXPECT_EQ(outputs.size(), read_outputs.size()); + + // Verify that everything got moved correctly and the device data matches what + // we have on record. + for (size_t i = 1, j = 0; i < handles.size(); i += 2, ++j) { + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[j].scalar<string>()())); + EXPECT_TRUE(CompareLiteralProtos(allocs[i].value(), response)); + } +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc index 2ae6f964623..4b5fe42487d 100644 --- a/tensorflow/compiler/xrt/xrt_state.cc +++ b/tensorflow/compiler/xrt/xrt_state.cc @@ -117,7 +117,7 @@ Status AllocateScopedShapedBuffer( xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first); uint64 size = transfer_manager->GetByteSizeRequirement(subshape); TF_ASSIGN_OR_RETURN( - xla::OwningDeviceMemory buffer, + se::OwningDeviceMemory buffer, allocator->Allocate(device_ordinal, size, /*retry_on_failure=*/false)); // Move our buffer into shaped_buffer, which takes ownership of it. index_to_buffer.second = buffer.Forget(); @@ -135,7 +135,7 @@ Status AllocateScopedShapedBuffer( XRTBufferAllocation::XRTBufferAllocation(const se::DeviceMemoryBase& allocation, int device_ordinal, - xla::DeviceMemoryAllocator* allocator) + se::DeviceMemoryAllocator* allocator) : size_(allocation.size()), allocation_(allocation), device_ordinal_(device_ordinal), @@ -169,7 +169,7 @@ void XRTBufferAllocation::DiscardAllocation() { } XRTTupleAllocation::XRTTupleAllocation(int device_ordinal, - xla::DeviceMemoryAllocator* allocator, + se::DeviceMemoryAllocator* allocator, const xla::Shape& on_host_shape, const xla::Shape& on_device_shape) : device_ordinal_(device_ordinal), @@ -340,9 +340,41 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr; return Status::OK(); } +/* static */ Status XRTTupleAllocation::CompactAllocations( + ResourceMgr* rm, xla::Backend* backend, int device_ordinal) { + std::vector<ResourceMgr::ResourceEntry> tuples; + rm->GetContainerResources(kTupleContainer, &tuples); + + std::vector<std::pair<string, xla::Literal>> host_tuples; + for (auto& rm_tuple : tuples) { + XRTTupleAllocation* tuple = + dynamic_cast<XRTTupleAllocation*>(rm_tuple.resource.get()); + if (tuple->device_ordinal() == device_ordinal) { + xla::Literal literal(tuple->on_host_shape()); + TF_RETURN_IF_ERROR(tuple->ToLiteral(backend, device_ordinal, &literal)); + host_tuples.emplace_back(rm_tuple.name, std::move(literal)); + // At this point there are two references held onto the XRTTupleAllocation + // object. One in the ResourceMgr, which we release here, and one held + // within the tuples vector, which we release in the tuples.clear() call + // below. 
+ TF_RETURN_IF_ERROR( + rm->Delete<XRTTupleAllocation>(kTupleContainer, rm_tuple.name)); + } + } + tuples.clear(); + + for (auto& name_literal : host_tuples) { + XRTTupleAllocation* tuple; + TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateAndTransfer( + name_literal.second, backend, device_ordinal, &tuple)); + TF_RETURN_IF_ERROR(rm->Create(kTupleContainer, name_literal.first, tuple)); + } + return Status::OK(); +} + /* static */ Status XRTTupleAllocation::ExpandTreeOfTuples( const xla::ShapeTree<ExpandedTupleInput>& elements, int device_ordinal, - xla::DeviceMemoryAllocator* allocator, xla::Shape* host_shape, + se::DeviceMemoryAllocator* allocator, xla::Shape* host_shape, xla::Shape* device_shape) { // Initialize both host and device shape to be the 'spine' of the new tuple // shape, given by the shape of the tree of tuples. @@ -415,7 +447,7 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr; xla::Shape subshape = xla::ShapeUtil::GetSubshape(device_shape, index); uint64 size = transfer_manager->GetByteSizeRequirement(subshape); - TF_ASSIGN_OR_RETURN(xla::OwningDeviceMemory buffer, + TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer, allocator->Allocate(device_ordinal, size, /*retry_on_failure=*/false)); VLOG(2) << "Allocated buffer at " << buffer.opaque() << " index " @@ -502,7 +534,7 @@ bool XRTTupleAllocation::IsExclusiveOwner() { void XRTTupleAllocation::InitializeFromShapedBuffer( const xla::ShapedBuffer& shaped_buffer, - xla::DeviceMemoryAllocator* allocator, int device_ordinal) { + se::DeviceMemoryAllocator* allocator, int device_ordinal) { for (auto& buffer : buffers_) { // Make a reference-counted version of the allocated buffer. buffer.second = new XRTBufferAllocation(shaped_buffer.buffer(buffer.first), @@ -549,7 +581,7 @@ XRTTupleAllocation::ToDeviceMemoryTree( if (!release_checker(buffer.first)) { *shaped_tree.mutable_element(buffer.first) = buffer.second->allocation(); } else { - *shaped_tree.mutable_element(buffer.first) = xla::OwningDeviceMemory( + *shaped_tree.mutable_element(buffer.first) = se::OwningDeviceMemory( buffer.second->allocation(), device_ordinal_, allocator_); DiscardAllocation(buffer.first); } diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h index 38dcf3d2891..4d284382532 100644 --- a/tensorflow/compiler/xrt/xrt_state.h +++ b/tensorflow/compiler/xrt/xrt_state.h @@ -25,7 +25,6 @@ limitations under the License. #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/backend.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -34,6 +33,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" #include "tensorflow/stream_executor/stream_executor.h" namespace tensorflow { @@ -45,8 +45,7 @@ namespace tensorflow { class XRTBufferAllocation : public core::RefCounted { public: XRTBufferAllocation(const se::DeviceMemoryBase& allocation, - int device_ordinal, - xla::DeviceMemoryAllocator* allocator); + int device_ordinal, se::DeviceMemoryAllocator* allocator); ~XRTBufferAllocation() override; // The region of device memory being wrapped. 
@@ -69,7 +68,7 @@ class XRTBufferAllocation : public core::RefCounted { uint64 size_ = 0; se::DeviceMemoryBase allocation_; int device_ordinal_; - xla::DeviceMemoryAllocator* allocator_; + se::DeviceMemoryAllocator* allocator_; }; // Entry in the resource manager corresponding to an allocation handle returned @@ -107,6 +106,11 @@ class XRTTupleAllocation : public ResourceBase { XRTTupleAllocation** allocation, bool alias_parent_allocation); + // Runs a compaction cycle which copies the device data to host, frees the + // device data, and then reallocates device memory and copies the data back. + static Status CompactAllocations(ResourceMgr* rm, xla::Backend* backend, + int device_ordinal); + // A structure describing a leaf of a tree of tuples to expand. Each leaf // contains an allocation and indicates whether or not the allocation's handle // should be freed after incorporating its buffers into the expanded tree. @@ -197,14 +201,14 @@ class XRTTupleAllocation : public ResourceBase { private: // Creates a new handle with (tuple) shape. - XRTTupleAllocation(int device_ordinal, xla::DeviceMemoryAllocator* allocator, + XRTTupleAllocation(int device_ordinal, se::DeviceMemoryAllocator* allocator, const xla::Shape& on_host_shape, const xla::Shape& on_device_shape); // Inherits the allocations represented in buffer, which must have the same // shape as buffers_. void InitializeFromShapedBuffer(const xla::ShapedBuffer& shaped_buffer, - xla::DeviceMemoryAllocator* allocator, + se::DeviceMemoryAllocator* allocator, int device_ordinal); // Takes a tree 'elements' where each leaf is an allocation, validates that @@ -214,12 +218,12 @@ class XRTTupleAllocation : public ResourceBase { // grafted on. static Status ExpandTreeOfTuples( const xla::ShapeTree<ExpandedTupleInput>& elements, int device_ordinal, - xla::DeviceMemoryAllocator* allocator, xla::Shape* host_shape, + se::DeviceMemoryAllocator* allocator, xla::Shape* host_shape, xla::Shape* device_shape); // Location of the memory that is being managed. int device_ordinal_; - xla::DeviceMemoryAllocator* allocator_; + se::DeviceMemoryAllocator* allocator_; // The shape that the caller thinks the tuple has.
const xla::Shape on_host_shape_; diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc index d9fce6e09f4..4b688f2d22f 100644 --- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc +++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc @@ -432,6 +432,17 @@ BigtableTestClient::AsyncReadRows( return nullptr; } +std::unique_ptr< + grpc::ClientAsyncReaderInterface<google::bigtable::v2::MutateRowsResponse>> +BigtableTestClient::PrepareAsyncMutateRows( + grpc::ClientContext* context, + const google::bigtable::v2::MutateRowsRequest& request, + grpc::CompletionQueue* cq) { + LOG(WARNING) << "Call to InMemoryDataClient::" << __func__ + << "(); this will likely cause a crash!"; + return nullptr; +} + std::shared_ptr<grpc::Channel> BigtableTestClient::Channel() { LOG(WARNING) << "Call to InMemoryDataClient::Channel(); this will likely " "cause a crash!"; diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h index 63d59b32dd1..299494b7180 100644 --- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h +++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h @@ -100,6 +100,12 @@ class BigtableTestClient : public ::google::cloud::bigtable::DataClient { const google::bigtable::v2::ReadRowsRequest& request, grpc::CompletionQueue* cq, void* tag) override; + std::unique_ptr<grpc::ClientAsyncReaderInterface< + google::bigtable::v2::MutateRowsResponse>> + PrepareAsyncMutateRows(grpc::ClientContext* context, + const google::bigtable::v2::MutateRowsRequest& request, + grpc::CompletionQueue* cq) override; + std::shared_ptr<grpc::Channel> Channel() override; private: diff --git a/tensorflow/contrib/distribute/python/keras_multi_worker_correctness_test.py b/tensorflow/contrib/distribute/python/keras_multi_worker_correctness_test.py index 889976d71b9..190aaf85c89 100644 --- a/tensorflow/contrib/distribute/python/keras_multi_worker_correctness_test.py +++ b/tensorflow/contrib/distribute/python/keras_multi_worker_correctness_test.py @@ -33,6 +33,7 @@ from tensorflow.python.distribute import combinations from tensorflow.python.distribute import multi_worker_util from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver from tensorflow.python.framework import ops +from tensorflow.python.keras.optimizer_v2 import gradient_descent from tensorflow.python.platform import test @@ -132,18 +133,19 @@ def make_image_model(initial_weights=None): return model, IMAGE_INPUTS, IMAGE_TARGETS -# TODO(b/130243026): Re-enable this test. 
def make_lstm_model(initial_weights=None): inputs = keras.layers.Input(shape=(10, 20)) - rnn1_out = keras.layers.LSTM(20, return_sequences=True)(inputs) - rnn2_out = keras.layers.LSTM(10)(rnn1_out) - outputs = keras.layers.Dense(1)(rnn2_out) + rnn_out = keras.layers.LSTM(4)(inputs) + outputs = keras.layers.Dense(1)(rnn_out) model = keras.Model(inputs, outputs) if initial_weights: model.set_weights(initial_weights) - model.compile('adam', 'binary_crossentropy', metrics=['mse']) + model.compile( + gradient_descent.SGD(0.1), + 'sparse_categorical_crossentropy', + metrics=['sparse_categorical_crossentropy']) return model, LSTM_INPUTS, LSTM_TARGETS @@ -177,7 +179,7 @@ class ModelCorrectnessTest( strategy_cls=[ collective_strategy.CollectiveAllReduceStrategy, ], - make_model=[make_image_model, make_embedding_model], + make_model=[make_image_model, make_lstm_model, make_embedding_model], required_gpus=[0, 1])) def test_correctness(self, strategy_cls, make_model): diff --git a/tensorflow/contrib/eager/python/examples/revnet/BUILD b/tensorflow/contrib/eager/python/examples/revnet/BUILD index b973a2848d0..a48d08b8a3a 100644 --- a/tensorflow/contrib/eager/python/examples/revnet/BUILD +++ b/tensorflow/contrib/eager/python/examples/revnet/BUILD @@ -86,6 +86,7 @@ cuda_py_test( additional_deps = [":blocks_test_main_lib"], shard_count = 4, tags = [ + "no_oss", # TODO(b/132387200): Segfaulting "optonly", ], ) diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc index e217f2e1a44..9dda04f3929 100644 --- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc +++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc @@ -42,7 +42,7 @@ limitations under the License. #if GOOGLE_CUDA #include "google/protobuf/duration.pb.h" #include "absl/time/time.h" -#include "cuda/include/cudnn.h" +#include "third_party/gpus/cudnn/cudnn.h" #include "tensorflow/core/kernels/conv_ops_gpu.h" #include "tensorflow/core/platform/logger.h" #include "tensorflow/core/platform/stream_executor.h" @@ -221,20 +221,20 @@ class LaunchFusedConv2DBiasActivationOp<CPUDevice, qint8, BiasType, ScaleType> { auto conv_output_scaled = conv_output.cast<ScaleType>() * conv_input_scale; - auto side_input_scaled = - side_input.cast<ScaleType>() * side_input_scale; - - if (activation_mode == ActivationMode::NONE) { - output = (conv_output_scaled + bias + side_input_scaled) + ScaleType lower_bound = (activation_mode == ActivationMode::NONE + ? 
static_cast<ScaleType>(kMinRange) + : 0); + if (side_input_scale == 0.0f) { + output = (conv_output_scaled + bias) .round() - .clip(static_cast<ScaleType>(kMinRange), - static_cast<ScaleType>(kMaxRange)) + .clip(lower_bound, static_cast<ScaleType>(kMaxRange)) .template cast<T>(); - - } else if (activation_mode == ActivationMode::RELU) { + } else { + auto side_input_scaled = + side_input.cast<ScaleType>() * side_input_scale; output = (conv_output_scaled + bias + side_input_scaled) .round() - .clip(0, static_cast<ScaleType>(kMaxRange)) + .clip(lower_bound, static_cast<ScaleType>(kMaxRange)) .template cast<T>(); } } @@ -518,9 +518,8 @@ tensorflow::ComputeCapability GetComputeCapability( return cc; } -void LogFusedConvAutotuneResults( - se::dnn::ConvolutionKind kind, se::dnn::DataType element_type, - const se::dnn::BatchDescriptor& input_desc, +void LogFusedConvForwardAutotuneResults( + se::dnn::DataType element_type, const se::dnn::BatchDescriptor& input_desc, const se::dnn::FilterDescriptor& filter_desc, const se::dnn::BatchDescriptor& output_desc, const se::dnn::ConvolutionDescriptor& conv_desc, double conv_scale, @@ -529,7 +528,7 @@ void LogFusedConvAutotuneResults( AutotuningLog log; { ConvolutionProto instr; - instr.set_kind(kind); + instr.set_kind(se::dnn::ConvolutionKind::FORWARD_BIAS_ACTIVATION); *instr.mutable_input() = input_desc.ToProto(element_type); *instr.mutable_filter() = filter_desc.ToProto(element_type); *instr.mutable_output() = output_desc.ToProto(element_type); @@ -936,8 +935,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>:: absl::Milliseconds(profile_result.elapsed_time_in_ms())); } } - internal::LogFusedConvAutotuneResults( - se::dnn::ConvolutionKind::FORWARD, + internal::LogFusedConvForwardAutotuneResults( se::dnn::ToDataType<typename RawType<T>::type>::value, conv_input_desc, filter_desc, output_desc, conv_desc, conv_input_scale, side_input_scale, dnn_activation_mode, stream->parent(), results); diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py index b4a2faacba8..0d796968c11 100644 --- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py +++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py @@ -950,7 +950,7 @@ class FusedConvInt8CPUTests(object): padding=padding_type, conv_input_scale=conv_input_scale, side_input_scale=side_input_scale, - side_input=side_input, + side_input=(None if side_input_scale == 0.0 else side_input), activation_mode="Relu" if apply_relu else "None", data_format="NHWC", filter_format="HWIO") @@ -1045,7 +1045,8 @@ class FusedConvInt8CorrespondenceTests(object): padding=padding_type, conv_input_scale=conv_input_scale, side_input_scale=side_input_scale, - side_input=_Int8Roundtrip(_NchwVectCToNhwc, side_input), + side_input=(None if side_input_scale == 0.0 else _Int8Roundtrip( + _NchwVectCToNhwc, side_input)), activation_mode="Relu" if apply_relu else "None", data_format="NHWC", filter_format="HWIO") @@ -1060,7 +1061,7 @@ class FusedConvInt8CorrespondenceTests(object): padding=padding_type, conv_input_scale=conv_input_scale, side_input_scale=side_input_scale, - side_input=side_input, + side_input=(None if side_input_scale == 0.0 else side_input), activation_mode="Relu" if apply_relu else "None", data_format="NCHW_VECT_C", filter_format="OIHW_VECT_I") diff --git 
a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py index efbdb1152d6..2c301267900 100644 --- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py +++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py @@ -189,7 +189,7 @@ def _kl_divergence(p, p_logits, q): def get_graph_def_from_disk(filename): """Get a GraphDef proto from a disk location.""" - with gfile.FastGFile(filename, 'rb') as f: + with gfile.GFile(filename, 'rb') as f: return graph_pb2.GraphDef.FromString(f.read()) diff --git a/tensorflow/contrib/model_pruning/BUILD b/tensorflow/contrib/model_pruning/BUILD index fa7eb9fe309..ce77143e0c3 100644 --- a/tensorflow/contrib/model_pruning/BUILD +++ b/tensorflow/contrib/model_pruning/BUILD @@ -156,6 +156,9 @@ py_test( srcs = ["python/strip_pruning_vars_test.py"], python_version = "PY2", srcs_version = "PY2AND3", + tags = [ + "no_oss", # b/132443370 + ], deps = [ ":layers", ":pruning", diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py index b469ebff25f..1c8cdc5a42f 100644 --- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py +++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py @@ -143,11 +143,6 @@ class CheckpointingTests(test.TestCase): suffix = "/.ATTRIBUTES/VARIABLE_VALUE" expected_checkpoint_names = [ name + suffix for name in expected_checkpoint_names] - # The optimizer and Dense layers also save get_config() JSON - expected_checkpoint_names.extend([ - "model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON", - "model/_named_dense/.ATTRIBUTES/OBJECT_CONFIG_JSON" - ]) named_variables = {v.name: v for v in named_variables} six.assertCountEqual(self, expected_checkpoint_names, named_variables.keys()) diff --git a/tensorflow/contrib/session_bundle/exporter_test.py b/tensorflow/contrib/session_bundle/exporter_test.py index 68419ffea04..33f10a47c59 100644 --- a/tensorflow/contrib/session_bundle/exporter_test.py +++ b/tensorflow/contrib/session_bundle/exporter_test.py @@ -88,12 +88,12 @@ class SaveRestoreShardedTest(test.TestCase): asset_file = constant_op.constant(asset_filepath_orig, name="filename42") ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, asset_file) - with gfile.FastGFile(asset_filepath_orig, "w") as f: + with gfile.GFile(asset_filepath_orig, "w") as f: f.write("your data here") assets_collection = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS) ignored_asset = os.path.join(test.get_temp_dir(), "ignored.txt") - with gfile.FastGFile(ignored_asset, "w") as f: + with gfile.GFile(ignored_asset, "w") as f: f.write("additional data here") variables.global_variables_initializer().run() diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc index 65f99a2eea7..db7dd6b9587 100644 --- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc +++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc @@ -21,7 +21,7 @@ limitations under the License. 
#include <vector> #define EIGEN_USE_GPU -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/util/gpu_launch_config.h" diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h index c35955e1057..0d4893cd5d6 100644 --- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h +++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h @@ -18,7 +18,7 @@ limitations under the License. #if GOOGLE_CUDA #if GOOGLE_TENSORRT -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" namespace tensorflow { namespace tensorrt { diff --git a/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py b/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py index 41aa4d26781..d85aae64871 100644 --- a/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py +++ b/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py @@ -19,5 +19,5 @@ from __future__ import division from __future__ import print_function # pylint: disable=wildcard-import,unused-import -from tensorflow.python.tpu._tpu_estimator_embedding import * +from tensorflow_estimator.python.estimator.tpu._tpu_estimator_embedding import * # pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/contrib/tpu/python/tpu/error_handling.py b/tensorflow/contrib/tpu/python/tpu/error_handling.py index 1b1328b4075..9cbb5084a54 100644 --- a/tensorflow/contrib/tpu/python/tpu/error_handling.py +++ b/tensorflow/contrib/tpu/python/tpu/error_handling.py @@ -19,5 +19,5 @@ from __future__ import division from __future__ import print_function # pylint: disable=wildcard-import,unused-import -from tensorflow.python.tpu.error_handling import * +from tensorflow_estimator.python.estimator.tpu.error_handling import * # pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py index c36aaa38c0e..2c9bce0bca2 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py @@ -19,5 +19,5 @@ from __future__ import division from __future__ import print_function # pylint: disable=wildcard-import,unused-import -from tensorflow.python.tpu.tpu_config import * +from tensorflow_estimator.python.estimator.tpu.tpu_config import * # pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py index b77b010cba6..573f49b2b9b 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py @@ -19,5 +19,5 @@ from __future__ import division from __future__ import print_function # pylint: disable=wildcard-import,unused-import -from tensorflow.python.tpu.tpu_context import * +from tensorflow_estimator.python.estimator.tpu.tpu_context import * # pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 893118412e1..0ee490681e4 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -19,15 +19,15 @@ from __future__ import division from __future__ import 
print_function # pylint: disable=wildcard-import,unused-import,redefined-builtin -from tensorflow.python.tpu.tpu_estimator import * +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import * # used by tests -from tensorflow.python.tpu.tpu_estimator import _clone_export_output_with_tensors -from tensorflow.python.tpu.tpu_estimator import _create_global_step -from tensorflow.python.tpu.tpu_estimator import _export_output_to_tensors -from tensorflow.python.tpu.tpu_estimator import _get_scaffold -from tensorflow.python.tpu.tpu_estimator import _Inputs -from tensorflow.python.tpu.tpu_estimator import _ITERATIONS_PER_LOOP_VAR -from tensorflow.python.tpu.tpu_estimator import _TPU_ENQUEUE_OPS -from tensorflow.python.tpu.tpu_estimator import _TPU_ESTIMATOR -from tensorflow.python.tpu.tpu_estimator import _TPU_TRAIN_OP +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _clone_export_output_with_tensors +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _create_global_step +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _export_output_to_tensors +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _get_scaffold +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _Inputs +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _ITERATIONS_PER_LOOP_VAR +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_ENQUEUE_OPS +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_ESTIMATOR +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_TRAIN_OP # pylint: enable=wildcard-import,unused-import,redefined-builtin diff --git a/tensorflow/contrib/tpu/python/tpu/util.py b/tensorflow/contrib/tpu/python/tpu/util.py index 8d9b70d46eb..6e0da240466 100644 --- a/tensorflow/contrib/tpu/python/tpu/util.py +++ b/tensorflow/contrib/tpu/python/tpu/util.py @@ -19,5 +19,5 @@ from __future__ import division from __future__ import print_function # pylint: disable=wildcard-import,unused-import -from tensorflow.python.tpu.util import * +from tensorflow_estimator.python.estimator.tpu.util import * # pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/contrib/verbs/grpc_verbs_service.cc b/tensorflow/contrib/verbs/grpc_verbs_service.cc index af29abd91fe..0f92ed3fe78 100644 --- a/tensorflow/contrib/verbs/grpc_verbs_service.cc +++ b/tensorflow/contrib/verbs/grpc_verbs_service.cc @@ -15,11 +15,8 @@ limitations under the License. #ifdef TENSORFLOW_USE_VERBS -#include "grpcpp/alarm.h" -#include "grpcpp/grpcpp.h" -#include "grpcpp/server_builder.h" - #include "tensorflow/contrib/verbs/grpc_verbs_service.h" + #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" #include "tensorflow/core/distributed_runtime/session_mgr.h" diff --git a/tensorflow/contrib/verbs/grpc_verbs_service.h b/tensorflow/contrib/verbs/grpc_verbs_service.h index e616778665a..97da84e3128 100644 --- a/tensorflow/contrib/verbs/grpc_verbs_service.h +++ b/tensorflow/contrib/verbs/grpc_verbs_service.h @@ -18,6 +18,9 @@ limitations under the License. 
#ifdef TENSORFLOW_USE_VERBS +#include "grpcpp/alarm.h" +#include "grpcpp/grpcpp.h" +#include "grpcpp/server_builder.h" #include "tensorflow/contrib/verbs/grpc_verbs_service_impl.h" #include "tensorflow/contrib/verbs/rdma_mgr.h" #include "tensorflow/contrib/verbs/verbs_service.pb.h" diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index c9386cee0cd..553d1901a9e 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -169,7 +169,6 @@ load( "tf_cuda_tests_tags", ) load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") load("@io_bazel_rules_closure//closure:defs.bzl", "closure_proto_library") load( "//third_party/mkl:build_defs.bzl", @@ -911,6 +910,7 @@ tf_cuda_library( "framework/kernel_def_builder.h", "framework/kernel_def_util.h", "framework/log_memory.h", + "framework/logging.h", "framework/lookup_interface.h", "framework/memory_types.h", "framework/node_def_builder.h", @@ -943,6 +943,7 @@ tf_cuda_library( "framework/tracking_allocator.h", "framework/type_index.h", "framework/type_traits.h", + "framework/typed_allocator.h", "framework/types.h", "public/version.h", "util/activation_mode.h", @@ -993,6 +994,32 @@ tf_cuda_library( ], ) +# This is redundant with the "framework" target above. It's useful for +# applications that want to depend on a minimal subset of TensorFlow (e.g. XLA). +cc_library( + name = "allocator", + srcs = [ + "framework/allocator.cc", + "framework/allocator_registry.cc", + "framework/allocator_registry.h", + "framework/numeric_types.h", + "framework/tracking_allocator.cc", + "framework/tracking_allocator.h", + "framework/type_traits.h", + ], + hdrs = [ + "framework/allocator.h", + ], + features = ["parse_headers"], + visibility = ["//visibility:public"], + deps = [ + ":lib", + "//third_party/eigen3", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + ], +) + cc_library( name = "stats_calculator_portable", srcs = [ @@ -1176,7 +1203,10 @@ tf_gen_op_libs( op_lib_names = [ "array_ops", ], - deps = [":protos_all_cc"], + deps = [ + ":lib", + ":protos_all_cc", + ], ) tf_gen_op_libs( @@ -1741,6 +1771,10 @@ filegroup( "util/reporter.*", "platform/**/cuda_libdevice_path.*", "platform/**/logger.cc", + # Exclude env_time and logging to avoid collisions with + # :platform_base, a common dependency for downstream targets. 
+ "platform/**/env_time.cc", + "platform/**/logging.cc", "platform/default/test_benchmark.*", "platform/cuda.h", "platform/rocm.h", @@ -1872,6 +1906,7 @@ cc_library( cc_library( name = "mobile_additional_lib_deps", deps = tf_additional_lib_deps() + [ + ":platform_base", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", @@ -1883,7 +1918,6 @@ cc_library( srcs = if_emscripten([":mobile_srcs_no_runtime"]), copts = ["-DSUPPORT_SELECTIVE_REGISTRATION"] + tf_opts_nortti_if_emscripten(), defines = ["TENSORFLOW_LITE_PROTOS"], - linkopts = ["-lz"], tags = [ "manual", "notap", @@ -1895,6 +1929,7 @@ cc_library( ":stats_calculator_portable", "//third_party/eigen3", "@double_conversion//:double-conversion", + "@farmhash_archive//:farmhash", "@nsync//:nsync_cpp", "@zlib_archive//:zlib", ], @@ -1937,6 +1972,7 @@ cc_library( ":stats_calculator_portable", "//third_party/eigen3", "@double_conversion//:double-conversion", + "@farmhash_archive//:farmhash", "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], @@ -2835,6 +2871,9 @@ tf_cuda_library( exclude = [ "**/*test*", "**/*main.cc", + "framework/allocator.cc", + "framework/allocator_registry.cc", + "framework/tracking_allocator.cc", "example/example_parser_configuration.*", "example/feature_util.cc", "util/reporter.cc", @@ -2866,6 +2905,7 @@ tf_cuda_library( ], }), deps = [ + ":allocator", ":feature_util", ":lib", ":lib_internal", @@ -3264,6 +3304,36 @@ tf_cuda_library( alwayslink = 1, ) +# This is redundant with the "core_cpu_*" targets above. It's useful for +# applications that want to depend on a minimal subset of TensorFlow (e.g. XLA). +cc_library( + name = "bfc_allocator", + srcs = [ + "common_runtime/allocator_retry.cc", + "common_runtime/allocator_retry.h", + "common_runtime/bfc_allocator.cc", + ], + hdrs = ["common_runtime/bfc_allocator.h"], + features = ["parse_headers"], + visibility = ["//visibility:public"], + deps = [ + ":allocator", + ":lib", + ":lib_internal", + ":shared_counter", + ], +) + +cc_library( + name = "shared_counter", + hdrs = ["common_runtime/shared_counter.h"], + features = ["parse_headers"], + visibility = ["//visibility:public"], + deps = [ + ":lib", + ], +) + cc_library( name = "regexp_internal", hdrs = [ @@ -3326,7 +3396,7 @@ tf_cuda_library( name = "device_tracer", srcs = tf_additional_device_tracer_srcs(), copts = tf_copts(), - cuda_deps = if_cuda_is_configured(tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps()), + cuda_deps = tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps(), visibility = [ "//tensorflow:internal", ], @@ -3444,6 +3514,26 @@ tf_cuda_library( ] + if_static([":gpu_runtime_impl"]), ) +# This is redundant with the "gpu_runtime_*" targets above. It's useful for +# applications that want to depend on a minimal subset of TensorFlow (e.g. XLA). 
+tf_cuda_library( + name = "gpu_bfc_allocator", + srcs = [ + "common_runtime/gpu/gpu_bfc_allocator.cc", + "common_runtime/gpu/gpu_id.h", + ], + hdrs = ["common_runtime/gpu/gpu_bfc_allocator.h"], + features = ["parse_headers"], + visibility = ["//visibility:public"], + deps = [ + ":bfc_allocator", + ":lib", + ":lib_internal", + ":protos_all_cc", + ":stream_executor", + ], +) + tf_cuda_library( name = "gpu_init", hdrs = [ diff --git a/tensorflow/core/api_def/base_api/api_def_Fingerprint.pbtxt b/tensorflow/core/api_def/base_api/api_def_Fingerprint.pbtxt new file mode 100644 index 00000000000..bf56a016277 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_Fingerprint.pbtxt @@ -0,0 +1,66 @@ +op { + graph_op_name: "Fingerprint" + endpoint { + name: "Fingerprint" + } + in_arg { + name: "data" + description: <<END +Must have rank 1 or higher. +END + } + in_arg { + name: "method" + description: <<END +Fingerprint method used by this op. Currently available method is +`farmhash::fingerprint64`. +END + } + out_arg { + name: "fingerprint" + description: <<END +A two-dimensional `Tensor` of type `tf.uint8`. The first dimension equals +`data`'s first dimension, and the second dimension size depends on the +fingerprint algorithm. +END + } + attr { + name: "T" + description: <<END +This can be a POD-type or string type. +END + } + summary: "Generates fingerprint values." + description: <<END +Generates fingerprint values of `data`. + +Fingerprint op considers the first dimension of `data` as the batch dimension, +and `output[i]` contains the fingerprint value generated from contents in +`data[i, ...]` for all `i`. + +Fingerprint op writes fingerprint values as byte arrays. For example, the +default method `farmhash64` generates a 64-bit fingerprint value at a time. +This 8-byte value is written out as a `uint8` array of size 8, in little-endian +order. + +For example, suppose that `data` has data type `DT_INT32` and shape (2, 3, 4), +and that the fingerprint method is `farmhash64`. In this case, the output shape +is (2, 8), where 2 is the batch dimension size of `data`, and 8 is the size of +each fingerprint value in bytes. `output[0, :]` is generated from 12 integers in +`data[0, :, :]` and similarly `output[1, :]` is generated from the other 12 integers +in `data[1, :, :]`. + +Note that this op fingerprints the raw underlying buffer, and it does not +fingerprint Tensor's metadata such as data type and/or shape. For example, the +fingerprint values are invariant under reshapes and bitcasts as long as the +batch dimension remains the same: + +``` +Fingerprint(data) == Fingerprint(Reshape(data, ...)) +Fingerprint(data) == Fingerprint(Bitcast(data, ...)) +``` + +For string data, one should expect `Fingerprint(data) != +Fingerprint(ReduceJoin(data))` in general.
+END +} diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrl.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrl.pbtxt index f75272a63b1..b52e7f6ba6d 100644 --- a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrl.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrl.pbtxt @@ -66,7 +66,7 @@ END description: <<END That is for rows we have grad for, we update var, accum and linear as follows: accum_new = accum + grad * grad -linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 accum = accum_new diff --git a/tensorflow/core/api_def/base_api/api_def_StringToHashBucketStrong.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringToHashBucketStrong.pbtxt index b63fbd1ff9d..e9764e94c32 100644 --- a/tensorflow/core/api_def/base_api/api_def_StringToHashBucketStrong.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_StringToHashBucketStrong.pbtxt @@ -21,7 +21,7 @@ END attr { name: "key" description: <<END -The key for the keyed hash function passed as a list of two uint64 +The key used to seed the hash function, passed as a list of two uint64 elements. END } @@ -34,8 +34,11 @@ defines the key of the hash function. `key` is an array of 2 elements. A strong hash is important when inputs may be malicious, e.g. URLs with additional components. Adversaries could try to make their inputs hash to the same bucket for a denial-of-service attack or to skew the results. A strong -hash prevents this by making it difficult, if not infeasible, to compute inputs -that hash to the same bucket. This comes at a cost of roughly 4x higher compute +hash can be used to make it difficult to find inputs with a skewed hash value +distribution over buckets. This requires that the hash function is +seeded by a high-entropy (random) "key" unknown to the adversary. + +The additional robustness comes at a cost of roughly 4x higher compute time than `tf.string_to_hash_bucket_fast`. END } diff --git a/tensorflow/core/api_def/python_api/api_def_Fingerprint.pbtxt b/tensorflow/core/api_def/python_api/api_def_Fingerprint.pbtxt new file mode 100644 index 00000000000..07f66fc173e --- /dev/null +++ b/tensorflow/core/api_def/python_api/api_def_Fingerprint.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "Fingerprint" + visibility: HIDDEN +} diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index ec649848e67..62461cf7fae 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -18,7 +18,6 @@ limitations under the License. #include <atomic> #include "tensorflow/core/common_runtime/allocator_retry.h" -#include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/lib/gtl/stl_util.h" #include "tensorflow/core/lib/strings/numbers.h" diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index ea385e6a174..bfd857a5e1b 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -32,7 +32,6 @@ limitations under the License. 
#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/protobuf/config.pb.h" namespace tensorflow { diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 37e9e5afa5d..9361521b807 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/metrics.h" #include "tensorflow/core/common_runtime/optimization_registry.h" #include "tensorflow/core/common_runtime/process_util.h" +#include "tensorflow/core/common_runtime/rendezvous_mgr.h" #include "tensorflow/core/common_runtime/scoped_allocator_mgr.h" #include "tensorflow/core/common_runtime/step_stats_collector.h" #include "tensorflow/core/framework/function.h" @@ -1261,6 +1262,11 @@ Status DirectSession::CreateExecutors( if (kernel && !OpSegment::ShouldOwnKernel(lib, kernel->type_string())) delete kernel; }; + params.rendezvous_factory = [](const int64, const DeviceMgr* device_mgr, + Rendezvous** r) { + *r = new IntraProcessRendezvous(device_mgr); + return Status::OK(); + }; optimizer.Optimize(lib, options_.env, device, &partition_graph, /*shape_map=*/nullptr); diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc index 68e035f3cde..14e1486545d 100644 --- a/tensorflow/core/common_runtime/direct_session_test.cc +++ b/tensorflow/core/common_runtime/direct_session_test.cc @@ -50,8 +50,8 @@ limitations under the License. #include "tensorflow/core/util/device_name_utils.h" #ifdef GOOGLE_CUDA -#include "cuda/include/cuda.h" -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #endif // GOOGLE_CUDA namespace tensorflow { diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index 38fd8c32cd7..0a4684393d7 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/eager/kernel_and_device.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/logging.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" @@ -566,10 +567,12 @@ Status EagerLocalExecute(EagerOperation* op, } const string& device_name = device == nullptr ? 
unspecified_device_name : device->name(); - if (ctx->LogDevicePlacement()) { - LOG(INFO) << "Executing op " << ndef.op() << " in device " << device_name; - } else { - VLOG(1) << "Executing op " << ndef.op() << " in device " << device_name; + if (ctx->LogDevicePlacement() || VLOG_IS_ON(1)) { + string msg = strings::StrCat("Executing op ", ndef.op(), " in device ", + device_name); + if (!logging::LogToListeners(msg)) { + LOG(INFO) << msg; + } } FunctionLibraryRuntime* flr = @@ -1061,9 +1064,12 @@ Status EagerExecute(EagerOperation* op, return EagerLocalExecute(op, retvals, num_retvals); } - if (op->EagerContext()->LogDevicePlacement()) { - LOG(INFO) << "Executing op " << op->Name() << " in device " - << op->Device()->name(); + if (op->EagerContext()->LogDevicePlacement() || VLOG_IS_ON(1)) { + string msg = strings::StrCat("Executing op ", op->Name(), " in device ", + op->Device()->name()); + if (!logging::LogToListeners(msg)) { + LOG(INFO) << msg; + } } return EagerRemoteExecute(op, retvals->data(), num_retvals); diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 4a117fae6a5..d2e599b04c1 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -158,14 +158,14 @@ struct NodeItem { // The kernel for this node. OpKernel* kernel = nullptr; - bool kernel_is_async : 1; // True iff kernel->AsAsync() != nullptr - bool is_merge : 1; // True iff IsMerge(node) - bool is_enter : 1; // True iff IsEnter(node) - bool is_constant_enter : 1; // True iff IsEnter(node) and - // node->GetAttr("is_constant") == true. - bool is_exit : 1; // True iff IsExit(node) - bool is_control_trigger : 1; // True iff IsControlTrigger(node) - bool is_sink : 1; // True iff IsSink(node) + bool kernel_is_async : 1; // True iff kernel->AsAsync() != nullptr + bool is_merge : 1; // True iff IsMerge(node) + bool is_enter : 1; // True iff IsEnter(node) + bool is_constant_enter : 1; // True iff IsEnter(node) and + // node->GetAttr("is_constant") == true. + bool is_exit : 1; // True iff IsExit(node) + bool is_control_trigger : 1; // True iff IsControlTrigger(node) + bool is_sink : 1; // True iff IsSink(node) // True iff IsEnter(node) || IsExit(node) || IsNextIteration(node) bool is_enter_exit_or_next_iter : 1; @@ -1249,6 +1249,7 @@ class ExecutorState { int64 step_id_; // Not owned. Rendezvous* rendezvous_; + Executor::RendezvousFactory* create_rendezvous_ = nullptr; CollectiveExecutor* collective_executor_ = nullptr; SessionState* session_state_; string session_handle_; @@ -1383,6 +1384,7 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl) log_memory_(LogMemory::IsEnabled()), step_id_(args.step_id), rendezvous_(args.rendezvous), + create_rendezvous_(&impl->params_.rendezvous_factory), collective_executor_(args.collective_executor), session_state_(args.session_state), session_handle_(args.session_handle), @@ -1627,6 +1629,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) { params.log_memory = log_memory_; params.record_tensor_accesses = impl_->device_record_tensor_accesses_; params.rendezvous = rendezvous_; + params.create_rendezvous = create_rendezvous_; params.collective_executor = collective_executor_; params.session_state = session_state_; params.session_handle = session_handle_; @@ -2231,14 +2234,10 @@ bool ExecutorState::NodeDone(const Status& s, const Node* node, if (cancellation_manager_) { // only log when the abort happens during the actual run time. 
auto device_name = impl_->params_.device->name(); - // Do not log OutOfRange errors as warnings because they are expected when + // Use VLOG instead of LOG(warning) because error status is expected when + // the executor is run under the grappler optimization phase or when // iterating through a tf.data input pipeline. - if (!errors::IsOutOfRange(s)) { - LOG(WARNING) << "[" << device_name - << "] Executor start aborting: " << s; - } else { - VLOG(1) << "[" << device_name << "] Executor start aborting: " << s; - } + VLOG(1) << "[" << device_name << "] Executor start aborting: " << s; } if (rendezvous_) { diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h index bcb9fd6212a..ff64201c80f 100644 --- a/tensorflow/core/common_runtime/executor.h +++ b/tensorflow/core/common_runtime/executor.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_H_ #include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/rendezvous_mgr.h" #include "tensorflow/core/framework/rendezvous.h" #include "tensorflow/core/framework/session_state.h" #include "tensorflow/core/framework/tensor.h" @@ -81,6 +82,9 @@ class Executor { // // RunAsync() dispatches closures to "runner". Typically, "runner" // is backed up by a bounded threadpool. + typedef std::function<Status(const int64, const DeviceMgr*, Rendezvous** r)> + RendezvousFactory; + struct Args { int64 step_id = 0; Rendezvous* rendezvous = nullptr; @@ -135,6 +139,8 @@ struct LocalExecutorParams { // when the executor is deleted. std::function<Status(const NodeDef&, OpKernel**)> create_kernel; std::function<void(OpKernel*)> delete_kernel; + + Executor::RendezvousFactory rendezvous_factory; }; ::tensorflow::Status NewLocalExecutor(const LocalExecutorParams& params, std::unique_ptr<const Graph> graph, diff --git a/tensorflow/core/common_runtime/executor_test.cc b/tensorflow/core/common_runtime/executor_test.cc index c311b2533ea..57019b08acf 100644 --- a/tensorflow/core/common_runtime/executor_test.cc +++ b/tensorflow/core/common_runtime/executor_test.cc @@ -13,11 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/core/common_runtime/executor.h" + #include <algorithm> #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" -#include "tensorflow/core/common_runtime/executor.h" #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" #include "tensorflow/core/common_runtime/process_util.h" #include "tensorflow/core/common_runtime/step_stats_collector.h" @@ -68,10 +69,16 @@ class ExecutorTest : public ::testing::Test { params.delete_kernel = [](OpKernel* kernel) { DeleteNonCachedKernel(kernel); }; + rendez_ = NewLocalRendezvous(); + params.rendezvous_factory = [this](const int64, const DeviceMgr*, + Rendezvous** r) { + *r = rendez_; + rendez_->Ref(); + return Status::OK(); + }; delete exec_; TF_CHECK_OK(NewLocalExecutor(params, std::move(graph), &exec_)); runner_ = [this](std::function<void()> fn) { thread_pool_->Schedule(fn); }; - rendez_ = NewLocalRendezvous(); } Status Run(Rendezvous* rendez) { diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index bde8958ce89..822dad751c1 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -401,6 +401,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { Executor* exec = nullptr; FunctionLibraryRuntimeOverlay* overlay_flr = nullptr; string executor_type; + Executor::RendezvousFactory rendezvous_factory = nullptr; ~Item() { delete this->func_graph; @@ -498,7 +499,6 @@ class CallOp : public AsyncOpKernel { errors::Internal("No function library is provided."), done); FunctionLibraryRuntime::Options opts; - opts.step_id = ctx->step_id(); opts.rendezvous = ctx->rendezvous(); opts.cancellation_manager = ctx->cancellation_manager(); opts.step_container = ctx->step_container(); @@ -773,6 +773,11 @@ Status FunctionLibraryRuntimeImpl::Instantiate( item->overlay_flr = new FunctionLibraryRuntimeOverlay(this, options.lib_def); } + item->rendezvous_factory = [](const int64, const DeviceMgr* device_mgr, + Rendezvous** r) { + *r = new IntraProcessRendezvous(device_mgr); + return Status::OK(); + }; local_handle = next_handle_++; items_.emplace(local_handle, std::unique_ptr<Item>(item)); } @@ -924,6 +929,7 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Item** item) { params.delete_kernel = [](OpKernel* kernel) { DeleteNonCachedKernel(kernel); }; + params.rendezvous_factory = (*item)->rendezvous_factory; Graph* graph = g.get(); std::unique_ptr<Executor> exec; TF_RETURN_IF_ERROR(NewExecutor(executor_type, params, std::move(g), &exec)); diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc index 47bd646126a..d6e3b6784ce 100644 --- a/tensorflow/core/common_runtime/function_test.cc +++ b/tensorflow/core/common_runtime/function_test.cc @@ -98,6 +98,11 @@ class FunctionTest : public ::testing::Test { params.delete_kernel = [](OpKernel* kernel) { DeleteNonCachedKernel(kernel); }; + params.rendezvous_factory = [](const int64, const DeviceMgr* device_mgr, + Rendezvous** r) { + *r = new IntraProcessRendezvous(device_mgr); + return Status::OK(); + }; Executor* exec; TF_CHECK_OK(NewLocalExecutor(params, std::move(g), &exec)); exec_.reset(exec); diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc index 42021e51f34..06b97073929 100644 --- 
a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc @@ -16,8 +16,6 @@ limitations under the License. #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h" #include "tensorflow/core/common_runtime/gpu/gpu_id.h" -#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h" -#include "tensorflow/core/common_runtime/gpu/gpu_init.h" #include "tensorflow/core/lib/strings/strcat.h" namespace tensorflow { diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h index d4c9cee89a9..fb4ca3ffdb7 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h @@ -21,7 +21,6 @@ limitations under the License. #include <unordered_map> #include <vector> -#include "tensorflow/core/common_runtime/allocator_retry.h" #include "tensorflow/core/common_runtime/bfc_allocator.h" #include "tensorflow/core/common_runtime/gpu/gpu_id.h" #include "tensorflow/core/platform/stream_executor.h" diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc index a97874aef60..75d21d80dcb 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/gpu/gpu_id.h" #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h" #include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/core/framework/typed_allocator.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/random/simple_philox.h" @@ -153,18 +154,18 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) { GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); CheckStats(&a, 0, 0, 0, 0); - float* first_ptr = a.Allocate<float>(1024); + float* first_ptr = TypedAllocator::Allocate<float>(&a, 1024, {}); a.DeallocateRaw(first_ptr); CheckStats(&a, 1, 0, 4096, 4096); for (int i = 0; i < 1024; ++i) { // Allocate several buffers of different sizes, and then clean them // all up. We should be able to repeat this endlessly without // causing fragmentation and growth. - float* t1 = a.Allocate<float>(1024); + float* t1 = TypedAllocator::Allocate<float>(&a, 1024, {}); - int64* t2 = a.Allocate<int64>(1048576); - double* t3 = a.Allocate<double>(2048); - float* t4 = a.Allocate<float>(10485760); + int64* t2 = TypedAllocator::Allocate<int64>(&a, 1048576, {}); + double* t3 = TypedAllocator::Allocate<double>(&a, 2048, {}); + float* t4 = TypedAllocator::Allocate<float>(&a, 10485760, {}); a.DeallocateRaw(t1); a.DeallocateRaw(t2); @@ -179,7 +180,7 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) { // At the end, we should have coalesced all memory into one region // starting at the beginning, so validate that allocating a pointer // starts from this region. 
- float* first_ptr_after = a.Allocate<float>(1024); + float* first_ptr_after = TypedAllocator::Allocate<float>(&a, 1024, {}); EXPECT_EQ(first_ptr, first_ptr_after); a.DeallocateRaw(first_ptr_after); } @@ -190,7 +191,7 @@ TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) { GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); - float* ptr = a.Allocate<float>(0); + float* ptr = TypedAllocator::Allocate<float>(&a, 0, {}); EXPECT_EQ(nullptr, ptr); } @@ -209,7 +210,7 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) { GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); - float* t1 = a.Allocate<float>(1); + float* t1 = TypedAllocator::Allocate<float>(&a, 1, {}); EXPECT_EQ(4, a.RequestedSize(t1)); EXPECT_EQ(256, a.AllocatedSize(t1)); a.DeallocateRaw(t1); @@ -223,8 +224,8 @@ TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) { // Configure a 1MiB byte limit GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc"); - float* first_ptr = a.Allocate<float>(1 << 6); - float* second_ptr = a.Allocate<float>(1 << 20); + float* first_ptr = TypedAllocator::Allocate<float>(&a, 1 << 6, {}); + float* second_ptr = TypedAllocator::Allocate<float>(&a, 1 << 20, {}); EXPECT_NE(nullptr, first_ptr); EXPECT_EQ(nullptr, second_ptr); diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc index 623582698ad..ea12a663b2f 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc @@ -14,7 +14,7 @@ limitations under the License. ==============================================================================*/ #ifdef GOOGLE_CUDA -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda.h" #include "tensorflow/stream_executor/cuda/cuda_activation.h" #endif // GOOGLE_CUDA diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc index 06b01fe9c6a..28a24658ad9 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/gpu/gpu_id.h" #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h" #include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/core/framework/typed_allocator.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor.h" @@ -47,7 +48,8 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_None) { for (int s : {8}) { std::vector<int64> cpu_array(s); memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64)); - int64* gpu_array = a.Allocate<int64>(cpu_array.size()); + int64* gpu_array = + TypedAllocator::Allocate<int64>(&a, cpu_array.size(), {}); se::DeviceMemory<int64> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}}; ASSERT_TRUE(stream_exec->SynchronousMemcpy(&gpu_array_ptr, &cpu_array[0], s * sizeof(int64))); @@ -74,7 +76,8 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) { std::vector<int64> cpu_array(s); memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64)); - int64* gpu_array = a.Allocate<int64>(cpu_array.size()); + int64* gpu_array = + TypedAllocator::Allocate<int64>(&a, cpu_array.size(), {}); se::DeviceMemory<int64> gpu_array_ptr{ se::DeviceMemoryBase{gpu_array}}; @@ -110,7 +113,8 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) { std::vector<int64> cpu_array(s); memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64)); - int64* gpu_array = a.Allocate<int64>(cpu_array.size()); + int64* gpu_array = + TypedAllocator::Allocate<int64>(&a, cpu_array.size(), {}); se::DeviceMemory<int64> gpu_array_ptr{ se::DeviceMemoryBase{gpu_array}}; @@ -145,7 +149,7 @@ TEST(GPUDebugAllocatorTest, ResetToNan) { std::vector<float> cpu_array_result(1024); // Allocate 1024 floats - float* gpu_array = a.Allocate<float>(cpu_array.size()); + float* gpu_array = TypedAllocator::Allocate<float>(&a, cpu_array.size(), {}); se::DeviceMemory<float> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}}; ASSERT_TRUE(stream_exec->SynchronousMemcpy(&cpu_array[0], gpu_array_ptr, cpu_array.size() * sizeof(float))); @@ -192,7 +196,7 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) { std::vector<float> cpu_array_result(1024); // Allocate 1024 floats - float* gpu_array = a.Allocate<float>(cpu_array.size()); + float* gpu_array = TypedAllocator::Allocate<float>(&a, cpu_array.size(), {}); se::DeviceMemory<float> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}}; ASSERT_TRUE(stream_exec->SynchronousMemcpy(&cpu_array[0], gpu_array_ptr, cpu_array.size() * sizeof(float))); @@ -241,7 +245,7 @@ TEST(GPUDebugAllocatorTest, AllocatedVsRequested) { new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), platform_gpu_id), platform_gpu_id); - float* t1 = a.Allocate<float>(1); + float* t1 = TypedAllocator::Allocate<float>(&a, 1, {}); EXPECT_EQ(4, a.RequestedSize(t1)); EXPECT_EQ(256, a.AllocatedSize(t1)); a.DeallocateRaw(t1); diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 6e455765b19..cc422038850 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -78,7 +78,7 @@ limitations under the License. 
#if !defined(PLATFORM_GOOGLE) #if GOOGLE_CUDA -#include "cuda/cuda_config.h" +#include "third_party/gpus/cuda/cuda_config.h" #endif #endif @@ -375,7 +375,8 @@ Status BaseGPUDevice::Init(const SessionOptions& options) { streams_.back()->device_to_host, streams_.back()->device_to_device)); } - em_.reset(new EventMgr(executor_, options.config.gpu_options())); + em_ = EventMgrFactory::Singleton()->GetEventMgr(executor_, + options.config.gpu_options()); GPUKernelTracker::Params tracker_params( options.config.gpu_options().experimental().kernel_tracker_max_interval(), @@ -404,13 +405,13 @@ Status BaseGPUDevice::Init(const SessionOptions& options) { } kernel_tracker_.reset(new GPUKernelTracker( tracker_params, Env::Default(), streams_[0]->compute, timing_counter, - timestamped_allocator_ ? gpu_allocator_ : nullptr, em_.get())); + timestamped_allocator_ ? gpu_allocator_ : nullptr, em_)); } gpu_device_info_ = new GpuDeviceInfo; gpu_device_info_->stream = streams_[0]->compute; gpu_device_info_->default_context = device_contexts_[0]; - gpu_device_info_->event_mgr = em_.get(); + gpu_device_info_->event_mgr = em_; PlatformGpuId platform_gpu_id; TF_RETURN_IF_ERROR( GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id)); diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index 3d824a87cba..4a6251230d8 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -157,7 +157,7 @@ class BaseGPUDevice : public LocalDevice { TfGpuId tf_gpu_id_; const bool sync_every_op_ = false; const int32 max_streams_; - std::unique_ptr<EventMgr> em_; + EventMgr* em_ = nullptr; std::unique_ptr<thread::ThreadPool> thread_pool_; std::unique_ptr<GPUKernelTracker> kernel_tracker_; int32 pending_cap_ = 0; diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc index 6531d6d367b..27cfe9b9799 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc @@ -295,4 +295,26 @@ void EventMgr::PollEvents(bool is_dedicated_poller, } } +EventMgrFactory* EventMgrFactory::Singleton() { + static EventMgrFactory* instance = new EventMgrFactory; + return instance; +} + +EventMgr* EventMgrFactory::GetEventMgr(se::StreamExecutor* se, + const GPUOptions& gpu_options) { + mutex_lock l(mu_); + // TODO(laigd): consider making gpu_options part of the key. It isn't + // currently, since EventMgr only relies on the deferred_deletion_bytes and + // polling_active_delay_usecs fields of gpu_options, which are rarely used. + auto itr = event_mgr_map_.find(se); + if (itr == event_mgr_map_.end()) { + auto event_mgr = new EventMgr(se, gpu_options); + event_mgr_map_[se] = event_mgr; + return event_mgr; + } else { + return itr->second; + } +} + } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h index 2d406b676e3..169a86a4986 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h +++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h @@ -63,9 +63,7 @@ void WarnIfInCallback(std::function<void()> f); // Events are recorded. class EventMgr { public: - EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options); - - ~EventMgr(); + virtual ~EventMgr(); // Releases the references on the elements of "tensors" as soon as // all events currently enqueued on "stream" have completed.
@@ -107,7 +105,9 @@ class EventMgr { } private: + friend class TEST_EventMgr; friend class TEST_EventMgrHelper; + friend class EventMgrFactory; se::StreamExecutor* const exec_; const int64 deferred_bytes_threshold_; const int32 polling_active_delay_usecs_; @@ -125,6 +125,8 @@ class EventMgr { typedef gtl::InlinedVector<InUse, 4> ToFreeVector; + EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options); + void FreeMemory(const ToFreeVector& to_free) { for (const auto& iu : to_free) { if (iu.mem != nullptr) { @@ -202,5 +204,20 @@ class EventMgr { thread::ThreadPool threadpool_; }; +// Manages all the EventMgr instances. +class EventMgrFactory { + public: + static EventMgrFactory* Singleton(); + + EventMgr* GetEventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options); + + private: + mutex mu_; + + // Maintain one EventMgr per physical device (StreamExecutor is + // per-physical-device). + std::map<se::StreamExecutor*, EventMgr*> event_mgr_map_ GUARDED_BY(mu_); +}; + } // namespace tensorflow #endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc index 6177e61a1ca..43ac015000b 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc @@ -37,6 +37,13 @@ limitations under the License. namespace tensorflow { +// Subclass EventMgr to access its private constructor. +class TEST_EventMgr : public EventMgr { + public: + TEST_EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options) + : EventMgr(se, gpu_options) {} +}; + class TEST_EventMgrHelper { public: explicit TEST_EventMgrHelper(EventMgr* em) : em_(em) { @@ -109,7 +116,7 @@ namespace { TEST(EventMgr, Empty) { auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); - EventMgr em(stream_exec, GPUOptions()); + TEST_EventMgr em(stream_exec, GPUOptions()); TEST_EventMgrHelper th(&em); EXPECT_EQ(0, th.queue_size()); EXPECT_EQ(0, th.free_size()); @@ -126,7 +133,7 @@ static void AddTensorReference(TensorReferenceVector* v, int64 size) { // the max simultaneously pending, we should not allocate any more. 
TEST(EventMgr, DelayedPolling) { auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); - EventMgr em(stream_exec, GPUOptions()); + TEST_EventMgr em(stream_exec, GPUOptions()); TEST_EventMgrHelper th(&em); EXPECT_EQ(0, th.queue_size()); TensorReferenceVector* v = nullptr; @@ -159,7 +166,7 @@ TEST(EventMgr, DelayedPolling) { TEST(EventMgr, FlushLargeTensorImmediately) { auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); - EventMgr em(stream_exec, GPUOptions()); + TEST_EventMgr em(stream_exec, GPUOptions()); TEST_EventMgrHelper th(&em); EXPECT_EQ(0, live_tensor_bytes); std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec)); @@ -176,7 +183,7 @@ TEST(EventMgr, FlushLargeTensorImmediately) { TEST(EventMgr, ManySmallTensorsFlushedImmediately) { auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); - EventMgr em(stream_exec, GPUOptions()); + TEST_EventMgr em(stream_exec, GPUOptions()); TEST_EventMgrHelper th(&em); EXPECT_EQ(0, live_tensor_bytes); std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec)); @@ -195,7 +202,7 @@ TEST(EventMgr, ManySmallTensorsFlushedImmediately) { TEST(EventMgr, StreamSwitchingFlushesImmediately) { auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); - EventMgr em(stream_exec, GPUOptions()); + TEST_EventMgr em(stream_exec, GPUOptions()); TEST_EventMgrHelper th(&em); EXPECT_EQ(0, live_tensor_bytes); std::unique_ptr<se::Stream> stream1(new se::Stream(stream_exec)); @@ -217,7 +224,7 @@ TEST(EventMgr, StreamSwitchingFlushesImmediately) { TEST(EventMgr, ManySmallTensorsSeparateCallsFlushed) { auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); - EventMgr em(stream_exec, GPUOptions()); + TEST_EventMgr em(stream_exec, GPUOptions()); TEST_EventMgrHelper th(&em); EXPECT_EQ(0, live_tensor_bytes); std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec)); @@ -239,7 +246,7 @@ TEST(EventMgr, ManySmallTensorsSeparateCallsFlushed) { // down gracefully. TEST(EventMgr, NonEmptyShutdown) { auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); - EventMgr em(stream_exec, GPUOptions()); + TEST_EventMgr em(stream_exec, GPUOptions()); TEST_EventMgrHelper th(&em); EXPECT_EQ(0, th.queue_size()); EXPECT_EQ(0, th.free_size()); @@ -258,7 +265,7 @@ TEST(EventMgr, NonEmptyShutdown) { // Tests that WarnIfInCallback() triggers correctly. 
TEST(EventMgr, WarnIfInCallback) { auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie(); - EventMgr em(stream_exec, GPUOptions()); + TEST_EventMgr em(stream_exec, GPUOptions()); TEST_EventMgrHelper th(&em); std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec)); CHECK(stream); @@ -299,7 +306,7 @@ class GPUDeviceTestHelper { se::Stream* h2d_stream() { return gpu_->streams_[0]->host_to_device; } se::Stream* d2h_stream() { return gpu_->streams_[0]->device_to_host; } se::Stream* d2d_stream() { return gpu_->streams_[0]->device_to_device[0]; } - EventMgr* event_mgr() { return gpu_->em_.get(); } + EventMgr* event_mgr() { return gpu_->em_; } int pending_cap() { return gpu_->pending_cap_; } private: @@ -585,7 +592,7 @@ static void BM_no_ops(int iters, int threads) { std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec)); CHECK(stream); stream->Init(); - EventMgr em(stream_exec, GPUOptions()); + TEST_EventMgr em(stream_exec, GPUOptions()); testing::StartTiming(); std::atomic<int> counter; counter.store(0, std::memory_order_seq_cst); diff --git a/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc index aad42df5f1f..d0f68987a5c 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc @@ -14,7 +14,7 @@ limitations under the License. ==============================================================================*/ #ifdef GOOGLE_CUDA -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda.h" #define EIGEN_USE_GPU #endif diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc index 800c4acf70a..c0e8ac467f4 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_util.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc @@ -151,7 +151,8 @@ void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev, if (total_bytes > 0) { tracing::ScopedAnnotation annotation("SetProtoFromGPU"); alloc = GPUProcessState::singleton()->GetGpuHostAllocator(0); - buf = alloc->Allocate<char>(total_bytes); + buf = static_cast<char*>( + alloc->AllocateRaw(Allocator::kAllocatorAlignment, total_bytes)); if (LogMemory::IsEnabled()) { LogMemory::RecordRawAllocation("SetProtoFromGPU", LogMemory::PROTO_BUFFER_STEP_ID, @@ -178,7 +179,7 @@ void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev, LogMemory::PROTO_BUFFER_STEP_ID, buf, alloc, false); } - alloc->Deallocate<char>(buf, total_bytes); + alloc->DeallocateRaw(buf); } done(Status::OK()); }); diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc index 13f4784ee7f..88cb238c003 100644 --- a/tensorflow/core/common_runtime/graph_runner.cc +++ b/tensorflow/core/common_runtime/graph_runner.cc @@ -164,6 +164,11 @@ Status GraphRunner::Run(Graph* graph, FunctionLibraryRuntime* function_library, kernel); }; params.delete_kernel = [](OpKernel* kernel) { delete kernel; }; + params.rendezvous_factory = [](const int64, const DeviceMgr* device_mgr, + Rendezvous** r) { + *r = new IntraProcessRendezvous(device_mgr); + return Status::OK(); + }; Executor* executor; TF_RETURN_IF_ERROR( diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc index a24757f33fa..ccec6c0cc28 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.cc +++ 
b/tensorflow/core/common_runtime/process_function_library_runtime.cc @@ -83,6 +83,15 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime( device_mgr, env, d, graph_def_version, lib_def_, default_thread_pool, optimizer_options, custom_kernel_creator, this); } + + DeviceMgr const* all_devices = device_mgr_; + if (parent_ != nullptr && parent_->remote_device_mgr() != nullptr) { + all_devices = parent_->remote_device_mgr(); + } + + for (auto d : all_devices->ListDevices()) { + device_set_.AddDevice(d); + } } /* static */ @@ -592,16 +601,11 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice( options.graph_collector->CollectRawGraph(def); } - DeviceSet device_set; - for (auto d : device_mgr_->ListDevices()) { - device_set.AddDevice(d); - } - TF_RETURN_IF_ERROR(SetArgShape(options.input_tensor_shapes, options.input_resource_dtypes_and_shapes, arg_nodes)); TF_RETURN_IF_ERROR(PinArgsAndRets(options.input_devices, - options.output_devices, device_set, + options.output_devices, device_set_, arg_nodes, ret_nodes)); std::unique_ptr<MultiDeviceFunctionData> data = @@ -617,7 +621,7 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice( optimization_options.session_options = &session_options; optimization_options.graph = &graph; optimization_options.flib_def = &data->lib_def_; - optimization_options.device_set = &device_set; + optimization_options.device_set = &device_set_; DumpGraph("Before running PRE_PLACEMENT passes", graph.get()); TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping( @@ -642,7 +646,7 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice( // TODO(b/124993244): Smartly merge options in nested defuns, and raise // exceptions/warnings in case where nested function call options are ignored. Placer placer(graph.get(), function_name, optimization_options.flib_def, - &device_set, default_device, + &device_set_, default_device, options.config_proto.allow_soft_placement(), options.config_proto.log_device_placement()); TF_RETURN_IF_ERROR(placer.Run()); @@ -658,7 +662,7 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice( DumpGraph("Before running graph optimization fn", graph.get()); Status status = options.optimize_graph_fn( std::move(ret_node_names), std::move(control_ret_node_names), - &data->lib_def_, device_set, cpu_device, &graph); + &data->lib_def_, device_set_, cpu_device, &graph); if (!status.ok()) { LOG(WARNING) << "Ignoring multi-device function optimization failure: " << status.ToString(); @@ -679,7 +683,7 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice( std::unordered_map<string, std::unique_ptr<Graph>> subgraphs; TF_RETURN_IF_ERROR( - PartitionFunctionGraph(device_set, std::move(graph), &subgraphs)); + PartitionFunctionGraph(device_set_, std::move(graph), &subgraphs)); for (const auto& pair : subgraphs) { DumpGraph(strings::StrCat("Before running POST_PARTITIONING passes (", @@ -736,8 +740,6 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice( FunctionNameGenerator name_generator(&data->lib_def_, function_name); for (const auto& pair : subgraphs) { i += 1; - // TODO(iga): Fail gracefully if the set of devices corresponds - // to more than one address space. 
const string& target = pair.first; FunctionLibraryRuntime* target_flr = GetFLR(target); const string& device_type = target_flr->device()->device_type(); @@ -761,8 +763,8 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice( opts.state_handle = options.state_handle; FunctionLibraryRuntime::Handle component_handle; - TF_RETURN_IF_ERROR(target_flr->Instantiate( - unique_name, AttrSlice(&shard.attr()), opts, &component_handle)); + TF_RETURN_IF_ERROR(Instantiate(unique_name, AttrSlice(&shard.attr()), opts, + &component_handle)); VLOG(1) << "Instantiated component function " << unique_name << " on device " << target << " with component handle " << component_handle; @@ -859,34 +861,57 @@ void ProcessFunctionLibraryRuntime::RunMultiDevice( opts_copy.rets_alloc_attrs = comp_data.ret_alloc_attrs_; opts_copy.remote_execution = false; - FunctionLibraryRuntime* flr = GetFLR(target); - // When target device has private thread pool, use the target device runner - thread::ThreadPool* pool = flr->device()->tensorflow_device_thread_pool(); - opts_copy.runner = (pool == nullptr) ? opts_copy.runner : flr->runner(); - std::vector<Tensor> comp_args = GetArgsForIndices(comp_data.arg_indices_, args); std::vector<Tensor>* comp_rets = new std::vector<Tensor>; rets->resize(data->num_outputs_); - VLOG(1) << "Running component function on device " << target - << " with handle " << handle; - VLOG(4) << " with " << opts_copy.DebugString(); - flr->Run( - opts_copy, handle, comp_args, comp_rets, - [comp_rets, rets, comp_data, refcounted_done](const Status& status) { - if (!status.ok()) { - VLOG(2) << "Component function execution failed: " << status; - refcounted_done->UpdateStatus(status); - } else { - for (int i = 0; i < comp_rets->size(); ++i) { - (*rets)[comp_data.ret_indices_[i]] = (*comp_rets)[i]; + FunctionLibraryRuntime* flr = GetFLR(target); + if (flr != nullptr) { + // When target device has private thread pool, use the target device + // runner + thread::ThreadPool* pool = flr->device()->tensorflow_device_thread_pool(); + opts_copy.runner = (pool == nullptr) ? 
opts_copy.runner : flr->runner(); + + VLOG(1) << "Running component function on device " << target + << " with handle " << handle; + VLOG(4) << " with " << opts_copy.DebugString(); + flr->Run( + opts_copy, handle, comp_args, comp_rets, + [comp_rets, rets, comp_data, refcounted_done](const Status& status) { + if (!status.ok()) { + VLOG(2) << "Component function execution failed: " << status; + refcounted_done->UpdateStatus(status); + } else { + for (int i = 0; i < comp_rets->size(); ++i) { + (*rets)[comp_data.ret_indices_[i]] = (*comp_rets)[i]; + } } - } - delete comp_rets; - // refcounted_done is thread-safe - refcounted_done->Unref(); - }); + delete comp_rets; + // refcounted_done is thread-safe + refcounted_done->Unref(); + }); + } else { + opts_copy.remote_execution = true; + + VLOG(1) << "Running component function on device " << target + << " with handle " << handle; + VLOG(4) << " with " << opts_copy.DebugString(); + Run(opts_copy, handle, comp_args, comp_rets, + [comp_rets, rets, comp_data, refcounted_done](const Status& status) { + if (!status.ok()) { + VLOG(2) << "Component function execution failed: " << status; + refcounted_done->UpdateStatus(status); + } else { + for (int i = 0; i < comp_rets->size(); ++i) { + (*rets)[comp_data.ret_indices_[i]] = (*comp_rets)[i]; + } + } + delete comp_rets; + // refcounted_done is thread-safe + refcounted_done->Unref(); + }); + } } refcounted_done->Unref(); } @@ -923,8 +948,10 @@ Status ProcessFunctionLibraryRuntime::Instantiate( f = function_data_[h].get(); *handle = h; } - TF_RETURN_IF_ERROR( - f->DistributedInit(parent_, function_name, *lib_def_, attrs, options)); + TF_RETURN_IF_ERROR(f->DistributedInit( + parent_, function_name, + options.lib_def == nullptr ? *lib_def_ : *options.lib_def, attrs, + options)); VLOG(1) << "ProcessFLR Instantiate [success]: " << function_name << " on: " << options.target << " with handle: " << *handle << " (this: " << this << ")"; @@ -962,6 +989,13 @@ Status ProcessFunctionLibraryRuntime::ReleaseMultiDeviceHandle( FunctionLibraryRuntime::Handle flr_handle = it.second.handle_; FunctionLibraryRuntime* flr = GetFLR(device); if (flr == nullptr) { + // TODO(nareshmodi): Implement DeregisterGraph call to remote device if + // parent is not null. + if (parent_ != nullptr) { + return errors::Unimplemented( + "Releasing a multi-device component handle on a remote device is " + "not yet implemented."); + } return errors::InvalidArgument( "Failed to find FunctionLibraryRuntime for device ", device, " when releasing multi-device function handle ", handle); diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h index 6e842ddcef8..ec0f9db5625 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.h +++ b/tensorflow/core/common_runtime/process_function_library_runtime.h @@ -307,6 +307,7 @@ class ProcessFunctionLibraryRuntime { Env* const env_; const DeviceMgr* const device_mgr_; + DeviceSet device_set_; const FunctionLibraryDefinition* lib_def_; thread::ThreadPool* default_thread_pool_; diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc index a73cb5a9f53..e1b36736555 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc @@ -34,8 +34,8 @@ limitations under the License. 
#include "tensorflow/core/public/version.h" #ifdef GOOGLE_CUDA -#include "cuda/include/cuda.h" -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #endif // GOOGLE_CUDA namespace tensorflow { @@ -43,7 +43,7 @@ namespace { class TestClusterFLR : public DistributedFunctionLibraryRuntime { public: - TestClusterFLR() {} + explicit TestClusterFLR(DeviceMgr* device_mgr) : device_mgr_(device_mgr) {} Status Instantiate(const string& function_name, const FunctionLibraryDefinition& lib_def, AttrSlice attrs, @@ -60,9 +60,12 @@ class TestClusterFLR : public DistributedFunctionLibraryRuntime { gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets, FunctionLibraryRuntime::DoneCallback done) override {} + DeviceMgr* remote_device_mgr() const override { return device_mgr_; } + private: mutex mu_; int next_handle_ GUARDED_BY(mu_) = 0; + DeviceMgr* device_mgr_; }; // TODO(b/128707168): Tests requiring a GPU device are currently always skipped @@ -101,7 +104,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test { for (const auto& fdef : flib) *(proto.add_function()) = fdef; lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto)); OptimizerOptions opts; - cluster_flr_.reset(new TestClusterFLR()); + cluster_flr_.reset(new TestClusterFLR(device_mgr_.get())); proc_flr_.reset(new ProcessFunctionLibraryRuntime( device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(), opts, nullptr, cluster_flr_.get())); diff --git a/tensorflow/core/common_runtime/scoped_allocator.h b/tensorflow/core/common_runtime/scoped_allocator.h index 64e3373dfa0..683bbc7e9ed 100644 --- a/tensorflow/core/common_runtime/scoped_allocator.h +++ b/tensorflow/core/common_runtime/scoped_allocator.h @@ -106,7 +106,6 @@ class ScopedAllocatorInstance : public Allocator { } void DeallocateRaw(void* p) LOCKS_EXCLUDED(mu_) override; bool TracksAllocationSizes() const override { return false; } - bool ShouldAllocateEmptyTensors() const override { return false; } size_t RequestedSize(const void* ptr) const override { return 0; } size_t AllocatedSize(const void* ptr) const override { return 0; } int64 AllocationId(const void* ptr) const override { return 0; } diff --git a/tensorflow/core/common_runtime/shared_counter.h b/tensorflow/core/common_runtime/shared_counter.h index 5e378524b20..12aa79cd3f4 100644 --- a/tensorflow/core/common_runtime/shared_counter.h +++ b/tensorflow/core/common_runtime/shared_counter.h @@ -15,6 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_ #define TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_ +#include <atomic> + +#include "tensorflow/core/platform/types.h" + namespace tensorflow { // A lightweight thread-safe monotone counter for establishing // temporal ordering. 
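[Editor's note] The shared_counter.h hunk above only adds the <atomic> and platform type headers that the existing class already relied on. For orientation, a monotone counter of this kind is typically a thin wrapper over std::atomic; the sketch below is a generic illustration under that assumption, not TensorFlow's SharedCounter implementation.

#include <atomic>
#include <cstdint>
#include <iostream>

// Generic sketch of a lightweight, thread-safe monotone counter used to
// establish a temporal ordering between events.
class MonotoneCounter {
 public:
  // Returns the next value; each call returns a strictly larger value.
  int64_t next() { return value_.fetch_add(1, std::memory_order_relaxed) + 1; }
  // Returns the most recently issued value.
  int64_t get() const { return value_.load(std::memory_order_relaxed); }

 private:
  std::atomic<int64_t> value_{0};
};

int main() {
  MonotoneCounter c;
  int64_t first = c.next();
  int64_t second = c.next();
  std::cout << first << " " << second << " " << c.get() << "\n";  // 1 2 2
  return 0;
}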
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD index ee2d61ec73a..7b2546050a6 100644 --- a/tensorflow/core/distributed_runtime/BUILD +++ b/tensorflow/core/distributed_runtime/BUILD @@ -129,6 +129,7 @@ cc_library( hdrs = ["session_mgr.h"], deps = [ ":graph_mgr", + ":remote_device", ":worker_cache_wrapper", ":worker_session", "//tensorflow/core:core_cpu_internal", @@ -199,6 +200,7 @@ tf_cuda_library( deps = [ ":graph_mgr", ":partial_run_mgr", + ":recent_request_ids", ":rendezvous_mgr_interface", ":session_mgr", ":tensor_coding", @@ -206,7 +208,6 @@ tf_cuda_library( ":worker_session", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib_internal", - "//tensorflow/core/distributed_runtime:recent_request_ids", "//tensorflow/core/profiler/lib:profiler_session", ], ) diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc index 08405e848c1..3a60ff0e02a 100644 --- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc +++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc @@ -196,9 +196,7 @@ void ClusterFunctionLibraryRuntime::Run( req->set_session_handle(worker_session_->session_name); req->set_create_worker_session_called(create_worker_session_called_); req->set_graph_handle(function_data->graph_handle); - // Borrowed from master_session.cc - const uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56); - req->set_step_id(step_id); + req->set_step_id(opts.step_id); int i = 0; for (const auto& send_key : function_data->send_keys) { NamedTensorProto* send = req->add_send(); @@ -212,7 +210,7 @@ void ClusterFunctionLibraryRuntime::Run( } CleanupGraphRequest* cleanup_req = new CleanupGraphRequest; - cleanup_req->set_step_id(step_id); + cleanup_req->set_step_id(opts.step_id); RunGraphResponse* resp = new RunGraphResponse(); CleanupGraphResponse* cleanup_resp = new CleanupGraphResponse; diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h index 1ea0a3ad515..28128f43177 100644 --- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h +++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h @@ -28,9 +28,11 @@ struct WorkerSession; class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime { public: ClusterFunctionLibraryRuntime(WorkerSession* worker_session, - bool create_worker_session_called) + bool create_worker_session_called, + DeviceMgr* remote_device_mgr) : worker_session_(worker_session), - create_worker_session_called_(create_worker_session_called) {} + create_worker_session_called_(create_worker_session_called), + remote_device_mgr_(remote_device_mgr) {} ~ClusterFunctionLibraryRuntime() override; @@ -44,6 +46,8 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime { gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets, FunctionLibraryRuntime::DoneCallback done) override; + DeviceMgr* remote_device_mgr() const override { return remote_device_mgr_; } + private: static Status ConstructFunctionGraph( const OpDef& sig, AttrSlice attrs, @@ -55,6 +59,8 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime { WorkerSession* const worker_session_ = nullptr; // not owned. const bool create_worker_session_called_; + DeviceMgr* remote_device_mgr_; // not owned. 
+ struct FunctionData { const string graph_handle; const string target; diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc index cd6e1350140..45a11fe201b 100644 --- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc +++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc @@ -42,10 +42,10 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test { worker_session_.reset(new WorkerSession( "cluster_test_session", "/job:localhost/replica:0/task:0", std::move(worker_cache), std::unique_ptr<DeviceMgr>(), - std::unique_ptr<GraphMgr>())); + std::unique_ptr<GraphMgr>(), nullptr)); - cluster_flr_.reset( - new ClusterFunctionLibraryRuntime(worker_session_.get(), true)); + cluster_flr_.reset(new ClusterFunctionLibraryRuntime(worker_session_.get(), + true, nullptr)); } Status ConstructFunctionGraphHelper( diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc index fea9ef5bcca..81d6412e1bf 100644 --- a/tensorflow/core/distributed_runtime/graph_mgr.cc +++ b/tensorflow/core/distributed_runtime/graph_mgr.cc @@ -121,13 +121,14 @@ Status GraphMgr::DecorateAndPublishGraphForDebug( // // "executors" are filled with one executor per device if success and // the caller takes the ownership of returned executors. -Status GraphMgr::InitItem(const string& session, const GraphDef& gdef, +Status GraphMgr::InitItem(const string& handle, const GraphDef& gdef, + WorkerSession* session, const GraphOptions& graph_options, const DebugOptions& debug_options, int64 collective_graph_key, DistributedFunctionLibraryRuntime* cluster_flr, Item* item) { - item->session = session; + item->session = handle; item->collective_graph_key = collective_graph_key; item->lib_def.reset( new FunctionLibraryDefinition(OpRegistry::Global(), gdef.library())); @@ -223,7 +224,7 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef, // kernels. Therefore, as long as the executor is alive, we need // to ensure the kernels cached for the session are alive. auto opseg = unit->device->op_segment(); - opseg->AddHold(session); + opseg->AddHold(handle); // Function library runtime. FunctionLibraryRuntime* lib = item->proc_flr->GetFLR(unit->device->name()); @@ -235,8 +236,8 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef, // Construct the root executor for the subgraph. params.device = unit->device; params.function_library = lib; - params.create_kernel = [session, lib, opseg](const NodeDef& ndef, - OpKernel** kernel) { + params.create_kernel = [handle, lib, opseg](const NodeDef& ndef, + OpKernel** kernel) { // NOTE(mrry): We must not share function kernels (implemented // using `CallOp`) between subgraphs, because `CallOp::handle_` // is tied to a particular subgraph. Even if the function itself @@ -250,13 +251,21 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef, // Kernels created for subgraph nodes need to be cached. On // cache miss, create_fn() is invoked to create a kernel based // on the function library here + global op registry. 
- return opseg->FindOrCreate(session, ndef.name(), kernel, create_fn); + return opseg->FindOrCreate(handle, ndef.name(), kernel, create_fn); }; params.delete_kernel = [lib](OpKernel* kernel) { if (kernel && !OpSegment::ShouldOwnKernel(lib, kernel->type_string())) { delete kernel; } }; + params.rendezvous_factory = [this, session](const int64 step_id, + const DeviceMgr*, + Rendezvous** r) -> Status { + auto* remote_r = this->worker_env_->rendezvous_mgr->Find(step_id); + TF_RETURN_IF_ERROR(remote_r->Initialize(session)); + *r = remote_r; + return Status::OK(); + }; optimizer.Optimize(lib, worker_env_->env, params.device, &subgraph, /*shape_map=*/nullptr); @@ -281,14 +290,15 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef, return Status::OK(); } -Status GraphMgr::Register(const string& session, const GraphDef& gdef, +Status GraphMgr::Register(const string& handle, const GraphDef& gdef, + WorkerSession* session, const GraphOptions& graph_options, const DebugOptions& debug_options, int64 collective_graph_key, DistributedFunctionLibraryRuntime* cluster_flr, - string* handle) { + string* graph_handle) { Item* item = new Item; - Status s = InitItem(session, gdef, graph_options, debug_options, + Status s = InitItem(handle, gdef, session, graph_options, debug_options, collective_graph_key, cluster_flr, item); if (!s.ok()) { item->Unref(); @@ -298,9 +308,9 @@ Status GraphMgr::Register(const string& session, const GraphDef& gdef, // Inserts one item into table_. { mutex_lock l(mu_); - *handle = strings::Printf("%016llx", ++next_id_); - item->handle = *handle; - CHECK(table_.insert({*handle, item}).second); + *graph_handle = strings::Printf("%016llx", ++next_id_); + item->handle = *graph_handle; + CHECK(table_.insert({*graph_handle, item}).second); } return Status::OK(); } @@ -472,29 +482,26 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id, return; } - StartParallelExecutors( - handle, step_id, item, rendezvous, ce_handle, collector, cost_graph, - cancellation_manager, - [item, rendezvous, ce_handle, done, start_time_usecs, input_size, - activity](const Status& s) { - done(s); - metrics::RecordGraphInputTensors(input_size); - metrics::UpdateGraphExecTime(Env::Default()->NowMicros() - - start_time_usecs); - rendezvous->Unref(); - item->Unref(); - delete activity; - delete ce_handle; - }); + StartParallelExecutors(handle, step_id, item, rendezvous, ce_handle, + collector, cost_graph, cancellation_manager, session, + [item, rendezvous, ce_handle, done, start_time_usecs, + input_size, activity](const Status& s) { + done(s); + metrics::RecordGraphInputTensors(input_size); + metrics::UpdateGraphExecTime( + Env::Default()->NowMicros() - start_time_usecs); + rendezvous->Unref(); + item->Unref(); + delete activity; + delete ce_handle; + }); } -void GraphMgr::StartParallelExecutors(const string& handle, int64 step_id, - Item* item, Rendezvous* rendezvous, - CollectiveExecutor::Handle* ce_handle, - StepStatsCollector* collector, - CostGraphDef* cost_graph, - CancellationManager* cancellation_manager, - StatusCallback done) { +void GraphMgr::StartParallelExecutors( + const string& handle, int64 step_id, Item* item, Rendezvous* rendezvous, + CollectiveExecutor::Handle* ce_handle, StepStatsCollector* collector, + CostGraphDef* cost_graph, CancellationManager* cancellation_manager, + WorkerSession* session, StatusCallback done) { const int num_units = item->units.size(); CHECK_GE(num_units, 1); ScopedStepContainer* step_container = new ScopedStepContainer( diff --git 
a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h index 5196046c196..fcd316dba44 100644 --- a/tensorflow/core/distributed_runtime/graph_mgr.h +++ b/tensorflow/core/distributed_runtime/graph_mgr.h @@ -74,11 +74,11 @@ class GraphMgr { // Registers a graph. Fills in "handle". The registered graph retains a // reference to cluster_flr to do cross process function calls. - Status Register(const string& session, const GraphDef& gdef, - const GraphOptions& graph_options, + Status Register(const string& handle, const GraphDef& gdef, + WorkerSession* session, const GraphOptions& graph_options, const DebugOptions& debug_options, int64 collective_graph_key, DistributedFunctionLibraryRuntime* cluster_flr, - string* handle); + string* graph_handle); // Executes one step of a registered graph "handle". // @@ -168,7 +168,7 @@ class GraphMgr { StepStatsCollector* collector, CostGraphDef* cost_graph, CancellationManager* cancellation_manager, - StatusCallback done); + WorkerSession* session, StatusCallback done); // Don't attempt to process cost models unless explicitly requested for at // least one of the items. @@ -177,8 +177,8 @@ class GraphMgr { void BuildCostModel(Item* item, StepStatsCollector* collector, CostGraphDef* cost_graph); - Status InitItem(const string& session, const GraphDef& gdef, - const GraphOptions& graph_options, + Status InitItem(const string& handle, const GraphDef& gdef, + WorkerSession* session, const GraphOptions& graph_options, const DebugOptions& debug_options, int64 collective_graph_key, DistributedFunctionLibraryRuntime* cluster_flr, Item* item); diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc index 70f3358adbe..5c55067d6cc 100644 --- a/tensorflow/core/distributed_runtime/master_session.cc +++ b/tensorflow/core/distributed_runtime/master_session.cc @@ -548,19 +548,13 @@ class RunManyGraphs { bool cancel_issued_ GUARDED_BY(mu_) = false; void ReportBadStatus(const Status& s) EXCLUSIVE_LOCKS_REQUIRED(mu_) { - // Start cancellation if we aren't already in an error state. - // TODO(jingdong): Change the following log to VLOG once the distributed - // error aggregation is stable. - LOG(INFO) << "Master received error status " << s; + VLOG(1) << "Master received error status " << s; if (!cancel_issued_ && !StatusGroup::IsDerived(s)) { // Only start cancelling other workers upon receiveing a non-derived // error cancel_issued_ = true; - // TODO(jingdong): Change the following log to VLOG once the distributed - // error aggregation feature is stable. - LOG(INFO) - << "Master received error report. Cancelling remaining workers."; + VLOG(1) << "Master received error report. 
Cancelling remaining workers."; for (Call& call : calls_) { call.opts.StartCancel(); } @@ -1287,6 +1281,13 @@ Status MasterSession::CreateWorkerSessions( workers[i].name = &worker_names[i]; workers[i].worker = worker_cache->CreateWorker(worker_names[i]); workers[i].request.set_session_handle(handle_); + if (session_opts_.config.experimental() + .share_cluster_devices_in_session()) { + for (const auto& remote_dev : devices_->devices()) { + *workers[i].request.add_cluster_device_attributes() = + remote_dev->attributes(); + } + } DeviceNameUtils::ParsedName name; if (!DeviceNameUtils::ParseFullName(worker_names[i], &name)) { diff --git a/tensorflow/core/distributed_runtime/remote_device.cc b/tensorflow/core/distributed_runtime/remote_device.cc index a043c5dee6b..f0fc6666b29 100644 --- a/tensorflow/core/distributed_runtime/remote_device.cc +++ b/tensorflow/core/distributed_runtime/remote_device.cc @@ -54,6 +54,16 @@ class RemoteDevice : public Device { TF_DISALLOW_COPY_AND_ASSIGN(RemoteDevice); }; +void AsRemoteDevices( + Env* env, + const protobuf::RepeatedPtrField<DeviceAttributes>& device_attributes, + std::vector<std::unique_ptr<Device>>* remote_devices) { + for (const auto& da : device_attributes) { + auto d = new RemoteDevice(env, da); + remote_devices->emplace_back(d); + } +} + void NewRemoteDevices(Env* env, WorkerCacheInterface* worker_cache, const string& worker_name, NewRemoteDevicesDone done) { WorkerInterface* wi = worker_cache->CreateWorker(worker_name); diff --git a/tensorflow/core/distributed_runtime/remote_device.h b/tensorflow/core/distributed_runtime/remote_device.h index 686af9547d6..1b2a4cd6279 100644 --- a/tensorflow/core/distributed_runtime/remote_device.h +++ b/tensorflow/core/distributed_runtime/remote_device.h @@ -19,13 +19,23 @@ limitations under the License. #include <functional> #include <string> #include <vector> + #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/protobuf.h" namespace tensorflow { +class DeviceAttributes; class Device; class Env; class WorkerCacheInterface; +// Creates Remote Devices for the provided device attributes. Helpful when the +// list of attributes is known, and doesn't need to be discovered via RPC. +void AsRemoteDevices( + Env* env, + const protobuf::RepeatedPtrField<DeviceAttributes>& device_attributes, + std::vector<std::unique_ptr<Device>>* remote_devices); + // NewRemoteDevices discovers available devices on the // 'remote_worker'. The implementation uses 'channel_cache' to // discover how to communicate with the 'remote_worker' (via gRPC, for diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h index b3ea64b4883..279e982c4b4 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/platform/mutex.h" // gRPC response caching. Most WorkerService methods cannot be retried directly // as they will fail or deadlock. 
To enable retrying, we can instead cache diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc index b070dd13dd6..28ac30d07ae 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc @@ -71,7 +71,7 @@ class RpcRendezvousMgrTest : public ::testing::Test { worker_session_("rpc_session", "/job:mnist/replica:1/task:2", std::unique_ptr<WorkerCacheInterface>(cache_), std::unique_ptr<DeviceMgr>(), - std::unique_ptr<GraphMgr>()), + std::unique_ptr<GraphMgr>(), nullptr), rmgr_(&env) { env.env = Env::Default(); } diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc index 868f0f87995..ace4e456ce2 100644 --- a/tensorflow/core/distributed_runtime/session_mgr.cc +++ b/tensorflow/core/distributed_runtime/session_mgr.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/renamed_device.h" #include "tensorflow/core/distributed_runtime/graph_mgr.h" +#include "tensorflow/core/distributed_runtime/remote_device.h" #include "tensorflow/core/distributed_runtime/worker_cache_wrapper.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/protobuf/cluster.pb.h" @@ -40,18 +41,27 @@ SessionMgr::SessionMgr( new WorkerCacheWrapper(default_worker_cache_.get())), worker_env->device_mgr, std::unique_ptr<GraphMgr>( - new GraphMgr(worker_env, worker_env->device_mgr)))), + new GraphMgr(worker_env, worker_env->device_mgr)), + nullptr)), worker_cache_factory_(std::move(worker_cache_factory)) {} /* static */ string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) { - return strings::StrCat("/job:", server_def.job_name(), "/replica:0/task:", - server_def.task_index()); + return strings::StrCat("/job:", server_def.job_name(), + "/replica:0/task:", server_def.task_index()); } Status SessionMgr::CreateSession(const string& session, const ServerDef& server_def, bool isolate_session_state) { + return CreateSession(session, server_def, {}, isolate_session_state); +} + +Status SessionMgr::CreateSession( + const string& session, const ServerDef& server_def, + const protobuf::RepeatedPtrField<DeviceAttributes>& + cluster_device_attributes, + bool isolate_session_state) { mutex_lock l(mu_); if (session.empty()) { return errors::InvalidArgument("Session must be non-empty."); @@ -76,6 +86,14 @@ Status SessionMgr::CreateSession(const string& session, std::shared_ptr<WorkerSession> worker_session; + std::unique_ptr<DeviceMgr> remote_devices; + if (!cluster_device_attributes.empty()) { + std::vector<std::unique_ptr<Device>> cluster_devices; + tensorflow::AsRemoteDevices(worker_env_->env, cluster_device_attributes, + &cluster_devices); + remote_devices.reset(new DeviceMgr(std::move(cluster_devices))); + } + if (isolate_session_state || server_def.cluster().job_size()) { if (server_def.cluster().job_size()) { VLOG(1) << "ClusterSpec propagation is enabled."; @@ -96,16 +114,18 @@ Status SessionMgr::CreateSession(const string& session, worker_session.reset( new WorkerSession(session, worker_name, std::unique_ptr<WorkerCacheInterface>(worker_cache), - std::move(device_mgr), std::move(graph_mgr))); + std::move(device_mgr), std::move(graph_mgr), + std::move(remote_devices))); } else { - // Borrown the WorkerEnv's DeviceMgr for the WorkerSession, so + // Borrow the 
WorkerEnv's DeviceMgr for the WorkerSession, so // that resources using it can use its devices after the // WorkerSession has been deleted. auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, worker_env_->device_mgr); worker_session = WorkerSession::CreateWithBorrowedDeviceMgr( session, worker_name, std::unique_ptr<WorkerCacheInterface>(worker_cache), - worker_env_->device_mgr, std::move(graph_mgr)); + worker_env_->device_mgr, std::move(graph_mgr), + std::move(remote_devices)); } sessions_.insert(std::make_pair(session, std::move(worker_session))); diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h index 04d1d614098..22bbe82dfbc 100644 --- a/tensorflow/core/distributed_runtime/session_mgr.h +++ b/tensorflow/core/distributed_runtime/session_mgr.h @@ -48,6 +48,10 @@ class SessionMgr { // Allocates state for a new session. Status CreateSession(const string& session, const ServerDef& server_def, bool isolate_session_state); + Status CreateSession( + const string& session, const ServerDef& server_def, + const protobuf::RepeatedPtrField<DeviceAttributes>& device_attributes, + bool isolate_session_state); // Locates the worker session for a given session handle Status WorkerSessionForSession(const string& session_handle, diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc index e31cad76dd2..47c06514fc1 100644 --- a/tensorflow/core/distributed_runtime/worker.cc +++ b/tensorflow/core/distributed_runtime/worker.cc @@ -45,9 +45,9 @@ void Worker::GetStatusAsync(const GetStatusRequest* request, void Worker::CreateWorkerSessionAsync(const CreateWorkerSessionRequest* request, CreateWorkerSessionResponse* response, StatusCallback done) { - Status s = env_->session_mgr->CreateSession(request->session_handle(), - request->server_def(), - request->isolate_session_state()); + Status s = env_->session_mgr->CreateSession( + request->session_handle(), request->server_def(), + request->cluster_device_attributes(), request->isolate_session_state()); done(s); } @@ -72,7 +72,7 @@ void Worker::RegisterGraphAsync(const RegisterGraphRequest* request, } if (s.ok()) { s = session->graph_mgr->Register( - request->session_handle(), request->graph_def(), + request->session_handle(), request->graph_def(), session.get(), request->graph_options(), request->debug_options(), request->collective_graph_key(), session->cluster_flr.get(), response->mutable_graph_handle()); diff --git a/tensorflow/core/distributed_runtime/worker_session.cc b/tensorflow/core/distributed_runtime/worker_session.cc index d53997e53f3..1a716c618f2 100644 --- a/tensorflow/core/distributed_runtime/worker_session.cc +++ b/tensorflow/core/distributed_runtime/worker_session.cc @@ -98,15 +98,18 @@ WorkerSession::WorkerSession(const string& session_name, const string& worker_name, std::unique_ptr<WorkerCacheInterface> worker_cache, std::unique_ptr<DeviceMgr> device_mgr, - std::unique_ptr<GraphMgr> graph_mgr) + std::unique_ptr<GraphMgr> graph_mgr, + std::unique_ptr<DeviceMgr> remote_device_mgr) : session_name(session_name), worker_name(worker_name), worker_cache(new WorkerFreeListCache(std::move(worker_cache))), graph_mgr(std::move(graph_mgr)), - cluster_flr( - new ClusterFunctionLibraryRuntime(this, !session_name.empty())), + cluster_flr(new ClusterFunctionLibraryRuntime( + this, !session_name.empty(), + remote_device_mgr ? 
remote_device_mgr.get() : nullptr)), device_mgr_(std::move(device_mgr)), - borrowed_device_mgr_(nullptr) { + borrowed_device_mgr_(nullptr), + remote_device_mgr_(std::move(remote_device_mgr)) { // Starts exporting metrics through a platform-specific monitoring API (if // provided). For builds using "tensorflow/core/platform/default", this is // currently a no-op. @@ -117,25 +120,28 @@ WorkerSession::WorkerSession(const string& session_name, std::shared_ptr<WorkerSession> WorkerSession::CreateWithBorrowedDeviceMgr( const string& session_name, const string& worker_name, std::unique_ptr<WorkerCacheInterface> worker_cache, - DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr) { - return std::shared_ptr<WorkerSession>( - new WorkerSession(session_name, worker_name, std::move(worker_cache), - borrowed_device_mgr, std::move(graph_mgr))); + DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr, + std::unique_ptr<DeviceMgr> remote_device_mgr) { + return std::shared_ptr<WorkerSession>(new WorkerSession( + session_name, worker_name, std::move(worker_cache), borrowed_device_mgr, + std::move(graph_mgr), std::move(remote_device_mgr))); } WorkerSession::WorkerSession(const string& session_name, const string& worker_name, std::unique_ptr<WorkerCacheInterface> worker_cache, DeviceMgr* borrowed_device_mgr, - std::unique_ptr<GraphMgr> graph_mgr) + std::unique_ptr<GraphMgr> graph_mgr, + std::unique_ptr<DeviceMgr> remote_device_mgr) : session_name(session_name), worker_name(worker_name), worker_cache(new WorkerFreeListCache(std::move(worker_cache))), graph_mgr(std::move(graph_mgr)), - cluster_flr( - new ClusterFunctionLibraryRuntime(this, !session_name.empty())), + cluster_flr(new ClusterFunctionLibraryRuntime(this, !session_name.empty(), + remote_device_mgr.get())), device_mgr_(nullptr), - borrowed_device_mgr_(borrowed_device_mgr) { + borrowed_device_mgr_(borrowed_device_mgr), + remote_device_mgr_(std::move(remote_device_mgr)) { // Starts exporting metrics through a platform-specific monitoring API (if // provided). For builds using "tensorflow/core/platform/default", this is // currently a no-op. 
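[Editor's note] The WorkerSession constructors above repeatedly thread the remote device manager through as a std::unique_ptr for the owner and a raw pointer for the borrower (the cluster FLR). Below is a minimal standalone sketch of that ownership split; `RemoteDeviceMgr`, `ClusterRuntime`, and `Session` are placeholder names, not the TensorFlow classes.

#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

// Placeholder for a device manager built from remote device attributes.
struct RemoteDeviceMgr {
  std::vector<std::string> device_names;
};

// Borrower: holds a non-owning pointer, mirroring how
// ClusterFunctionLibraryRuntime receives remote_device_mgr.
class ClusterRuntime {
 public:
  explicit ClusterRuntime(RemoteDeviceMgr* remote) : remote_(remote) {}
  bool has_remote_devices() const { return remote_ != nullptr; }

 private:
  RemoteDeviceMgr* remote_;  // not owned
};

// Owner: keeps the unique_ptr alive at least as long as the borrower,
// mirroring WorkerSession::remote_device_mgr_.
class Session {
 public:
  explicit Session(std::unique_ptr<RemoteDeviceMgr> remote)
      : cluster_runtime_(remote.get()),
        remote_device_mgr_(std::move(remote)) {}

  const ClusterRuntime& cluster_runtime() const { return cluster_runtime_; }

 private:
  ClusterRuntime cluster_runtime_;
  std::unique_ptr<RemoteDeviceMgr> remote_device_mgr_;
};

int main() {
  auto remote = std::make_unique<RemoteDeviceMgr>();
  remote->device_names.push_back("/job:worker/replica:0/task:1/device:CPU:0");
  Session session(std::move(remote));
  std::cout << session.cluster_runtime().has_remote_devices() << "\n";  // 1
  return 0;
}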
diff --git a/tensorflow/core/distributed_runtime/worker_session.h b/tensorflow/core/distributed_runtime/worker_session.h index f1faf493647..90b656f7a47 100644 --- a/tensorflow/core/distributed_runtime/worker_session.h +++ b/tensorflow/core/distributed_runtime/worker_session.h @@ -60,12 +60,14 @@ struct WorkerSession { WorkerSession(const string& session_name, const string& worker_name, std::unique_ptr<WorkerCacheInterface> worker_cache, std::unique_ptr<DeviceMgr> device_mgr, - std::unique_ptr<GraphMgr> graph_mgr); + std::unique_ptr<GraphMgr> graph_mgr, + std::unique_ptr<DeviceMgr> remote_device_mgr); static std::shared_ptr<WorkerSession> CreateWithBorrowedDeviceMgr( const string& session_name, const string& worker_name, std::unique_ptr<WorkerCacheInterface> worker_cache, - DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr); + DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr, + std::unique_ptr<DeviceMgr> remote_device_mgr); ~WorkerSession(); @@ -73,10 +75,12 @@ struct WorkerSession { WorkerSession(const string& session_name, const string& worker_name, std::unique_ptr<WorkerCacheInterface> worker_cache, DeviceMgr* borrowed_device_mgr, - std::unique_ptr<GraphMgr> graph_mgr); + std::unique_ptr<GraphMgr> graph_mgr, + std::unique_ptr<DeviceMgr> remote_device_mgr); const std::unique_ptr<DeviceMgr> device_mgr_; DeviceMgr* const borrowed_device_mgr_; // Not owned. + const std::unique_ptr<DeviceMgr> remote_device_mgr_; }; } // namespace tensorflow diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc index dd2d5161157..7ab87d1f8a4 100644 --- a/tensorflow/core/framework/allocator.cc +++ b/tensorflow/core/framework/allocator.cc @@ -15,10 +15,11 @@ limitations under the License. #include "tensorflow/core/framework/allocator.h" +#include <atomic> + #include "tensorflow/core/framework/allocator_registry.h" -#include "tensorflow/core/framework/log_memory.h" #include "tensorflow/core/framework/tracking_allocator.h" -#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/mutex.h" @@ -41,22 +42,6 @@ constexpr size_t Allocator::kAllocatorAlignment; Allocator::~Allocator() {} -void RunResourceCtor(ResourceHandle* p, size_t n) { - for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle(); -} - -void RunResourceDtor(ResourceHandle* p, size_t n) { - for (size_t i = 0; i < n; ++p, ++i) p->~ResourceHandle(); -} - -void Allocator::RunVariantCtor(Variant* p, size_t n) { - for (size_t i = 0; i < n; ++p, ++i) new (p) Variant(); -} - -void Allocator::RunVariantDtor(Variant* p, size_t n) { - for (size_t i = 0; i < n; ++p, ++i) p->~Variant(); -} - // If true, cpu allocator collects more stats. static bool cpu_allocator_collect_stats = false; // If true, cpu allocator collects full stats. diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h index 36c7e7bfa2f..861e855c3d5 100644 --- a/tensorflow/core/framework/allocator.h +++ b/tensorflow/core/framework/allocator.h @@ -18,23 +18,20 @@ limitations under the License. 
#include <stdlib.h> +#include <functional> #include <limits> #include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "tensorflow/core/framework/numeric_types.h" -#include "tensorflow/core/framework/resource_handle.h" #include "tensorflow/core/framework/type_traits.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/numa.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { -class Variant; - // Attributes for a single allocation call. Different calls to the same // allocator could potentially have different allocation attributes. struct AllocationAttributes { @@ -129,51 +126,25 @@ class Allocator { // REQUIRES: "ptr" was previously returned by a call to AllocateRaw virtual void DeallocateRaw(void* ptr) = 0; - // Convenience functions to do typed allocation. C++ constructors - // and destructors are invoked for complex types if necessary, - // depending on the concrete Allocator implementation. May return - // NULL if the tensor has too many elements to represent in a single - // allocation. - template <typename T> - T* Allocate(size_t num_elements) { - return Allocate<T>(num_elements, AllocationAttributes()); - } - - template <typename T> - T* Allocate(size_t num_elements, - const AllocationAttributes& allocation_attr) { - // TODO(jeff): Do we need to allow clients to pass in alignment - // requirements? - - if (num_elements > (std::numeric_limits<size_t>::max() / sizeof(T))) { - return NULL; - } - - void* p = AllocateRaw(kAllocatorAlignment, sizeof(T) * num_elements, - allocation_attr); - T* typed_p = reinterpret_cast<T*>(p); - if (typed_p) RunCtor<T>(typed_p, num_elements); - return typed_p; - } - - template <typename T> - void Deallocate(T* ptr, size_t num_elements) { - if (ptr) { - RunDtor<T>(ptr, num_elements); - DeallocateRaw(ptr); - } - } - // Returns true if this allocator tracks the sizes of allocations. // RequestedSize and AllocatedSize must be overridden if // TracksAllocationSizes is overridden to return true. virtual bool TracksAllocationSizes() const { return false; } - // Returns true if this allocator requires tensors with 0 elements - // to allocate buffers. This is false for most allocators, but may - // be used by special-case allocators that want to track tensor - // usage. - virtual bool ShouldAllocateEmptyTensors() const { return false; } + // Returns true if this allocator allocates an opaque handle rather than the + // requested number of bytes. + // + // This method returns false for most allocators, but may be used by + // special-case allocators that track tensor usage. If this method returns + // true, AllocateRaw() should be invoked for all values of `num_bytes`, + // including 0. + // + // NOTE: It is the caller's responsibility to track whether an allocated + // object is a buffer or an opaque handle. In particular, when this method + // returns `true`, users of this allocator must not run any constructors or + // destructors for complex objects, since there is no backing store for the + // tensor in which to place their outputs. + virtual bool AllocatesOpaqueHandle() const { return false; } // Returns the user-requested size of the data allocated at // 'ptr'. 
Note that the actual buffer allocated might be larger @@ -232,80 +203,8 @@ class Allocator { virtual void ClearStats() {} virtual void SetSafeFrontier(uint64 count) {} - - private: - // No constructors or destructors are run for simple types - template <typename T> - void RunCtor(T* p, size_t n) { - static_assert(is_simple_type<T>::value, "T is not a simple type."); - } - - template <typename T> - void RunDtor(T* p, size_t n) {} - - // custom constructors and destructors that can be overridden for - // non-standard allocators - - // Runs string's default constructor for p[0], p[1], ..., p[n-1]. - virtual void RunStringCtor(string* p, size_t n) { - for (size_t i = 0; i < n; ++p, ++i) new (p) string(); - } - - // Runs string's default destructor for p[0], p[1], ..., p[n-1]. - virtual void RunStringDtor(string* p, size_t n) { - for (size_t i = 0; i < n; ++p, ++i) p->~string(); - } - - virtual void RunResourceCtor(ResourceHandle* p, size_t n) { - for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle(); - } - - // Runs string's default destructor for p[0], p[1], ..., p[n-1]. - virtual void RunResourceDtor(ResourceHandle* p, size_t n) { - for (size_t i = 0; i < n; ++p, ++i) p->~ResourceHandle(); - } - - virtual void RunVariantCtor(Variant* p, size_t n); - - virtual void RunVariantDtor(Variant* p, size_t n); - - // TODO(jeff): Maybe provide some interface to give info about - // current allocation state (total number of bytes available for - // allocation, number of bytes free on device, etc.) }; -// Allocator-specific constructors and destructors are used for -// strings -template <> -inline void Allocator::RunCtor(string* p, size_t n) { - RunStringCtor(p, n); -} - -template <> -inline void Allocator::RunDtor(string* p, size_t n) { - RunStringDtor(p, n); -} - -template <> -inline void Allocator::RunCtor(ResourceHandle* p, size_t n) { - RunResourceCtor(p, n); -} - -template <> -inline void Allocator::RunDtor(ResourceHandle* p, size_t n) { - RunResourceDtor(p, n); -} - -template <> -inline void Allocator::RunCtor(Variant* p, size_t n) { - RunVariantCtor(p, n); -} - -template <> -inline void Allocator::RunDtor(Variant* p, size_t n) { - RunVariantDtor(p, n); -} - // An implementation of Allocator that delegates all calls to another Allocator. // // Useful to clients who want to override part of the functionality of another @@ -336,8 +235,8 @@ class AllocatorWrapper : public Allocator { return wrapped_->TracksAllocationSizes(); } - bool ShouldAllocateEmptyTensors() const override { - return wrapped_->TracksAllocationSizes(); + bool AllocatesOpaqueHandle() const override { + return wrapped_->AllocatesOpaqueHandle(); } size_t RequestedSize(const void* ptr) const override { diff --git a/tensorflow/core/framework/allocator_test.cc b/tensorflow/core/framework/allocator_test.cc index 85e8ba6a71b..3d03b2da1d3 100644 --- a/tensorflow/core/framework/allocator_test.cc +++ b/tensorflow/core/framework/allocator_test.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include <algorithm> #include <vector> +#include "tensorflow/core/framework/typed_allocator.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" @@ -102,14 +103,14 @@ TEST(CPUAllocatorTest, Simple) { a->DeallocateRaw(ptrs[i]); } CheckStats(a, 1023, 0, 552640, 1024); - float* t1 = a->Allocate<float>(1024); - double* t2 = a->Allocate<double>(1048576); + float* t1 = TypedAllocator::Allocate<float>(a, 1024, {}); + double* t2 = TypedAllocator::Allocate<double>(a, 1048576, {}); CheckStats(a, 1025, 1048576 * sizeof(double) + 1024 * sizeof(float), 1048576 * sizeof(double) + 1024 * sizeof(float), 1048576 * sizeof(double)); - a->Deallocate(t1, 1024); - a->Deallocate(t2, 1048576); + TypedAllocator::Deallocate(a, t1, 1024); + TypedAllocator::Deallocate(a, t2, 1048576); CheckStats(a, 1025, 0, 1048576 * sizeof(double) + 1024 * sizeof(float), 1048576 * sizeof(double)); @@ -130,7 +131,8 @@ TEST(CPUAllocatorTest, AllocateOverflowMaxSizeT) { // The maximum size_t value will definitely overflow. size_t count_to_allocate = std::numeric_limits<size_t>::max(); - TestStruct* const test_pointer = a->Allocate<TestStruct>(count_to_allocate); + TestStruct* const test_pointer = + TypedAllocator::Allocate<TestStruct>(a, count_to_allocate, {}); CHECK_EQ(test_pointer, reinterpret_cast<TestStruct*>(NULL)); } @@ -141,7 +143,8 @@ TEST(CPUAllocatorTest, AllocateOverflowSmallest) { // count_to_allocate is the smallest count that will cause overflow. const size_t count_to_allocate = (std::numeric_limits<size_t>::max() / sizeof(TestStruct)) + 1; - TestStruct* const test_pointer = a->Allocate<TestStruct>(count_to_allocate); + TestStruct* const test_pointer = + TypedAllocator::Allocate<TestStruct>(a, count_to_allocate, {}); CHECK_EQ(test_pointer, reinterpret_cast<TestStruct*>(NULL)); } diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc index b0533fbc508..200b799c2ce 100644 --- a/tensorflow/core/framework/dataset.cc +++ b/tensorflow/core/framework/dataset.cc @@ -50,6 +50,14 @@ class DatasetVariantWrapper { if (dataset_) dataset_->Ref(); } + DatasetVariantWrapper& operator=(DatasetVariantWrapper&& other) { + if (&other == this) return *this; + std::swap(dataset_, other.dataset_); + return *this; + } + + DatasetVariantWrapper& operator=(const DatasetVariantWrapper& other) = delete; + ~DatasetVariantWrapper() { if (dataset_) dataset_->Unref(); } @@ -75,7 +83,7 @@ class DatasetVariantWrapper { } private: - DatasetBase* const dataset_; // Owns one reference. + DatasetBase* dataset_; // Owns one reference. }; const char kWrappedDatasetVariantTypeName[] = @@ -405,7 +413,7 @@ Status DatasetBase::DatasetGraphDefBuilder::AddInputDataset( Status DatasetBaseIterator::GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors, bool* end_of_sequence) { - profiler::TraceMe activity(absl::string_view(params_.prefix), + profiler::TraceMe activity([&] { return BuildTraceMeName(); }, profiler::TraceMeLevel::kInfo); RecordStart(ctx, /*stop_output=*/true); Status s = GetNextInternal(ctx, out_tensors, end_of_sequence); diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h index 0bcdf52981f..f86bc741cb8 100644 --- a/tensorflow/core/framework/dataset.h +++ b/tensorflow/core/framework/dataset.h @@ -232,8 +232,8 @@ class GraphDefBuilderWrapper { // Also looks up the `op_def->name` in the global // `WhitelistedStatefulOpRegistry`. 
bool IsOpWhitelisted(const OpDef* op_def) const { - return ((str_util::EndsWith(op_def->name(), "Dataset") || - str_util::EndsWith(op_def->name(), "DatasetV2")) && + return ((absl::EndsWith(op_def->name(), "Dataset") || + absl::EndsWith(op_def->name(), "DatasetV2")) && op_def->output_arg_size() == 1 && op_def->output_arg(0).type() == DT_VARIANT) || WhitelistedStatefulOpRegistry::Global()->Contains(op_def->name()); @@ -731,9 +731,6 @@ class DatasetBaseIterator : public IteratorBase { ~DatasetBaseIterator() override { params_.dataset->Unref(); } - // The sequence of iterators leading up to this iterator. - const string& prefix() const override { return params_.prefix; } - const DataTypeVector& output_dtypes() const override { return params_.dataset->output_dtypes(); } @@ -742,6 +739,15 @@ class DatasetBaseIterator : public IteratorBase { return params_.dataset->output_shapes(); } + // The sequence of iterators leading up to this iterator. + const string& prefix() const override { return params_.prefix; } + + // Returns a name to be used for the TraceMe event. + // + // NOTE: TraceMe support passing key value pairs of "arguments" using the + // following format "name#arg_1=value_,...,arg_n=value_n". + virtual string BuildTraceMeName() { return params_.prefix; } + Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors, bool* end_of_sequence) final; diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h index 5766109603f..b6ef479ad24 100644 --- a/tensorflow/core/framework/function.h +++ b/tensorflow/core/framework/function.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_FRAMEWORK_FUNCTION_H_ #include <vector> + #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/function.pb.h" @@ -27,6 +28,7 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mutex.h" @@ -628,8 +630,13 @@ class FunctionLibraryRuntime { // In the cross-process scenario, runner isn't used for making the Async // RPC calls. struct Options { - // The id of the step that is calling this function. - int64 step_id = 0; + // Choose a step ID that is guaranteed not to clash with any + // Session-generated step ID. DirectSession only generates + // non-negative step IDs (contiguous, starting from 0), and + // MasterSession generates 56-bit random step IDs whose MSB is + // always 0, so a negative random step ID should suffice. + const int64 step_id = -std::abs(static_cast<int64>(random::New64())); + Rendezvous* rendezvous = nullptr; CancellationManager* cancellation_manager = nullptr; CollectiveExecutor* collective_executor = nullptr; @@ -785,6 +792,9 @@ class DistributedFunctionLibraryRuntime { FunctionLibraryRuntime::LocalHandle handle, gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets, FunctionLibraryRuntime::DoneCallback done) = 0; + + // DeviceMgr with *all* available devices. 
+ virtual DeviceMgr* remote_device_mgr() const = 0; }; // Extracts the actual type from "attr_values" based on its definition diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc index 7c2f06aea87..4cc8d12bb18 100644 --- a/tensorflow/core/framework/function_testlib.cc +++ b/tensorflow/core/framework/function_testlib.cc @@ -569,6 +569,23 @@ FunctionDef MakeTensorSliceDataset() { {"output_shapes", "$output_shapes"}}}}); } +FunctionDef Unique() { + return FDH::Create( + // Name + "GetUnique", + // Args + {"x:T"}, + // Return values + {"y:T", "idx: out_idx"}, + // Attr def + {"T: type", "out_idx: {int32, int64} = DT_INT32"}, + // Nodes + { + {{"result"}, "Unique", {"x"}, {{"T", "$T"}, {"out_idx", "$out_idx"}}}, + }, + {{"y", "result:y:0"}, {"idx", "result:idx:0"}}); +} + void FunctionTestSchedClosure(std::function<void()> fn) { static thread::ThreadPool* w = new thread::ThreadPool(Env::Default(), "Test", 8); diff --git a/tensorflow/core/framework/function_testlib.h b/tensorflow/core/framework/function_testlib.h index 85398e8a6e8..9893d1dbc8d 100644 --- a/tensorflow/core/framework/function_testlib.h +++ b/tensorflow/core/framework/function_testlib.h @@ -135,6 +135,9 @@ FunctionDef RandomUniformLess(); // x:T -> y: TensorSliceDatasetOp::Dataset FunctionDef MakeTensorSliceDataset(); +// x:T -> y: T, idx: out_idx +FunctionDef Unique(); + void FunctionTestSchedClosure(std::function<void()> fn); } // end namespace function diff --git a/tensorflow/core/framework/graph_to_functiondef.cc b/tensorflow/core/framework/graph_to_functiondef.cc index d67a418fdbd..664e8e272cb 100644 --- a/tensorflow/core/framework/graph_to_functiondef.cc +++ b/tensorflow/core/framework/graph_to_functiondef.cc @@ -208,10 +208,18 @@ Status GraphToFunctionDef(const Graph& graph, const string& name, node_def->add_input( strings::StrCat(edge->src()->name(), ":", edge->src_output())); } - // Add control inputs + std::vector<std::string> control_inputs; + control_inputs.reserve(control_edges.size()); for (const Edge* edge : control_edges) { - node_def->add_input(strings::StrCat("^", edge->src()->name())); + control_inputs.push_back(strings::StrCat("^", edge->src()->name())); + } + // Sort the control inputs so that nodes that are semantically equivalent + // generate idential node_def. + std::sort(control_inputs.begin(), control_inputs.end()); + + for (const auto& input : control_inputs) { + node_def->add_input(input); } // Populate tensor_renaming. diff --git a/tensorflow/core/framework/logging.cc b/tensorflow/core/framework/logging.cc new file mode 100644 index 00000000000..7a819e7fb0c --- /dev/null +++ b/tensorflow/core/framework/logging.cc @@ -0,0 +1,55 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/logging.h" + +#include <iostream> + +#include "tensorflow/core/lib/strings/str_util.h" + +namespace tensorflow { + +namespace logging { + +typedef std::vector<void (*)(const char*)> Listeners; + +Listeners* GetListeners() { + static Listeners* listeners = new Listeners; + return listeners; +} + +bool RegisterListener(void (*listener)(const char*)) { + GetListeners()->push_back(listener); + return true; +} + +bool LogToListeners(string msg, string end) { + auto listeners = logging::GetListeners(); + if (listeners->empty()) { + return false; + } + + string ended_msg = strings::StrCat(msg, end); + + for (auto& listener : *listeners) { + listener(ended_msg.c_str()); + } + + return true; +} + +} // end namespace logging + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/logging_ops.h b/tensorflow/core/framework/logging.h similarity index 75% rename from tensorflow/core/kernels/logging_ops.h rename to tensorflow/core/framework/logging.h index 92a8d634094..9bde3d51056 100644 --- a/tensorflow/core/kernels/logging_ops.h +++ b/tensorflow/core/framework/logging.h @@ -13,11 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_ -#define TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_ +#ifndef TENSORFLOW_CORE_FRAMEWORK_LOGGING_H_ +#define TENSORFLOW_CORE_FRAMEWORK_LOGGING_H_ -#include "absl/strings/str_cat.h" -#include "absl/strings/str_split.h" +#include <string> namespace tensorflow { @@ -27,7 +26,12 @@ namespace logging { // Returns true if it is successfully registered. bool RegisterListener(void (*listener)(const char*)); +// Log string to active listeners. Returns true if any listeners were +// registered. +bool LogToListeners(std::string msg, std::string end = "\n"); + } // namespace logging + } // namespace tensorflow -#endif // TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_ +#endif // TENSORFLOW_CORE_FRAMEWORK_LOGGING_H_ diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h index f373407ed00..349636241e8 100644 --- a/tensorflow/core/framework/model.h +++ b/tensorflow/core/framework/model.h @@ -295,6 +295,12 @@ class Node { return result; } + // Returns the per-element processing time spent in this node. + double SelfProcessingTime() const LOCKS_EXCLUDED(mu_) { + tf_shared_lock l(mu_); + return SelfProcessingTimeLocked(); + } + // Returns the per-element CPU time spent in the subtree rooted in this node. double TotalProcessingTime() const LOCKS_EXCLUDED(mu_) { tf_shared_lock l(mu_); @@ -343,7 +349,7 @@ class Node { for (auto& input : inputs_) { // Inputs for which autotuning is disabled are excluded. if (input->autotune()) { - sum += input->SelfProcessingTimeLocked(); + sum += input->SelfProcessingTime(); } } return sum; diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc index 8da2d8af95c..6c657c8461b 100644 --- a/tensorflow/core/framework/node_def_util.cc +++ b/tensorflow/core/framework/node_def_util.cc @@ -21,7 +21,6 @@ limitations under the License. 
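// Illustrative sketch (not part of the patch): how a client might hook the
// listener API declared in the new tensorflow/core/framework/logging.h above.
// The helper names (Capture, captured, Example) are hypothetical and exist
// only for this example.
#include <string>
#include <vector>

#include "tensorflow/core/framework/logging.h"

namespace {

std::vector<std::string>* captured() {
  static auto* v = new std::vector<std::string>;
  return v;
}

// RegisterListener takes a plain function pointer, so the listener cannot
// capture state; stash delivered messages in a function-local static instead.
void Capture(const char* msg) { captured()->push_back(msg); }

}  // namespace

void Example() {
  // Typically done once, e.g. from a module initializer.
  tensorflow::logging::RegisterListener(Capture);

  // Returns true because a listener is registered; Capture receives
  // "hello\n", since the default line terminator is "\n".
  tensorflow::logging::LogToListeners("hello");
}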
#include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/graph.pb_text.h" -#include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_def.pb_text.h" #include "tensorflow/core/framework/op_def_util.h" @@ -50,7 +49,7 @@ AttrSlice::AttrSlice(const NodeDef& node_def) AttrSlice::AttrSlice(const AttrValueMap* a) : ndef_(nullptr), attrs_(a) {} -static string SummarizeAttrsHelper(AttrSlice attrs, StringPiece device) { +string SummarizeAttrsHelper(AttrSlice attrs, StringPiece device) { string ret; // We sort the attrs so the output is deterministic. @@ -120,6 +119,13 @@ string FormatNodeDefForError(const NodeDef& node_def) { return FormatNodeForError(NodeDebugInfo(node_def)); } +string FormatNodeDefForError( + StringPiece node_name, bool has_experimental_debug_info, + const NodeDef_ExperimentalDebugInfo& experimental_debug_info) { + return FormatNodeForError(NodeDebugInfo( + node_name, has_experimental_debug_info, experimental_debug_info)); +} + void GetMergedOriginalNodeNames(const NodeDebugInfo& from, const NodeDebugInfo& to, std::set<string>* names) { diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h index 5e8b53d49ca..d85c53abfa2 100644 --- a/tensorflow/core/framework/node_def_util.h +++ b/tensorflow/core/framework/node_def_util.h @@ -20,6 +20,7 @@ limitations under the License. #include <vector> #include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/flatmap.h" @@ -34,6 +35,7 @@ struct NodeDebugInfo; // We forward declare protos so that kernels don't need to depend on them class NodeDef; class OpDef; +class AttrSlice; // Name of the attribute used to encode node colocation constraints. // @@ -50,12 +52,16 @@ extern const char* const kColocationGroupPrefix; string SummarizeNode(const Node& node); string SummarizeNodeDef(const NodeDef& node_def); string SummarizeAttrs(const NodeDef& node_def); +string SummarizeAttrsHelper(AttrSlice attrs, StringPiece device); // Produces a formatted string pattern from the node which can uniquely identify // this node upstream to produce an informative error message. The pattern // followed is: {{node <node_name>}} string FormatNodeForError(const Node& node); string FormatNodeDefForError(const NodeDef& node_def); +string FormatNodeDefForError( + StringPiece node_name, bool has_experimental_debug_info, + const NodeDef_ExperimentalDebugInfo& experimental_debug_info); // Merges the original node names from the debug information of 'from' to the // debug information of 'to'. 
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index a03e2d8aa91..f84b3520eb3 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -101,10 +101,10 @@ OpKernel::OpKernel(OpKernelConstruction* context, context->output_types().end()), output_memory_types_(context->output_memory_types().begin(), context->output_memory_types().end()), - graph_def_version_(context->graph_def_version()), - is_internal_(str_util::StartsWith(type_string(), "_")), input_name_map_(context->num_inputs()), output_name_map_(context->num_outputs()), + graph_def_version_(context->graph_def_version()), + is_internal_(str_util::StartsWith(type_string(), "_")), cost_estimate_(OpKernel::kInitialCostEstimateCycles) { OP_REQUIRES_OK(context, NameRangesForNode(*def_, *context->op_def_, &input_name_map_, @@ -985,7 +985,10 @@ struct KernelRegistration { // This maps from 'op_type' + DeviceType to the set of KernelDefs and // factory functions for instantiating the OpKernel that matches the // KernelDef. -typedef std::unordered_multimap<string, KernelRegistration> KernelRegistry; +struct KernelRegistry { + mutex mu; + std::unordered_multimap<string, KernelRegistration> registry GUARDED_BY(mu); +}; #if defined(_WIN32) static const char kKernelLibPattern[] = "libtfkernel*.dll"; @@ -1113,9 +1116,12 @@ void OpKernelRegistrar::InitInternal(const KernelDef* kernel_def, // before some file libraries can initialize, which in turn crashes the // program flakily. Until we get rid of static initializers in kernel // registration mechanism, we have this workaround here. - reinterpret_cast<KernelRegistry*>(GlobalKernelRegistry()) - ->emplace(key, KernelRegistration(*kernel_def, kernel_class_name, - std::move(factory))); + auto global_registry = + reinterpret_cast<KernelRegistry*>(GlobalKernelRegistry()); + mutex_lock l(global_registry->mu); + global_registry->registry.emplace( + key, + KernelRegistration(*kernel_def, kernel_class_name, std::move(factory))); } delete kernel_def; } @@ -1132,28 +1138,33 @@ namespace { static const StringPiece kKernelAttr("_kernel"); // TODO(irving): Replace with const Node& version below. -Status FindKernelRegistration(const DeviceType& device_type, - const NodeDef& node_def, - const KernelRegistration** reg, - bool* was_attr_mismatch) { +Status FindKernelRegistration( + const DeviceType& device_type, StringPiece node_name, + bool has_experimental_debug_info, + const NodeDef_ExperimentalDebugInfo& experimental_debug_info, + StringPiece node_op, AttrSlice node_attrs, const KernelRegistration** reg, + bool* was_attr_mismatch) { *reg = nullptr; *was_attr_mismatch = false; // Label defaults to empty if not found in NodeDef. - const string& label = GetNodeAttrString(node_def, kKernelAttr); + const string& label = GetNodeAttrString(node_attrs, kKernelAttr); - const string key = Key(node_def.op(), device_type, label); - auto regs = GlobalKernelRegistryTyped()->equal_range(key); + const string key = Key(node_op, device_type, label); + auto typed_registry = GlobalKernelRegistryTyped(); + tf_shared_lock lock(typed_registry->mu); + auto regs = typed_registry->registry.equal_range(key); for (auto iter = regs.first; iter != regs.second; ++iter) { // If there is a kernel registered for the op and device_type, // check that the attrs match. 
bool match; - TF_RETURN_IF_ERROR(KernelAttrsMatch(iter->second.def, node_def, &match)); + TF_RETURN_IF_ERROR(KernelAttrsMatch(iter->second.def, node_attrs, &match)); if (match) { if (*reg != nullptr) { return errors::InvalidArgument( "Multiple OpKernel registrations match NodeDef '", - FormatNodeDefForError(node_def), "': '", - ProtoShortDebugString((*reg)->def), "' and '", + FormatNodeDefForError(node_name, has_experimental_debug_info, + experimental_debug_info), + "': '", ProtoShortDebugString((*reg)->def), "' and '", ProtoShortDebugString(iter->second.def), "'"); } *reg = &iter->second; @@ -1164,6 +1175,16 @@ Status FindKernelRegistration(const DeviceType& device_type, return Status::OK(); } +Status FindKernelRegistration(const DeviceType& device_type, + const NodeDef& node_def, + const KernelRegistration** reg, + bool* was_attr_mismatch) { + return FindKernelRegistration( + device_type, node_def.name(), node_def.has_experimental_debug_info(), + node_def.experimental_debug_info(), node_def.op(), + AttrSlice(&node_def.attr()), reg, was_attr_mismatch); +} + } // namespace bool KernelDefAvailable(const DeviceType& device_type, @@ -1176,24 +1197,31 @@ bool KernelDefAvailable(const DeviceType& device_type, } // TODO(irving): Change const NodeDef& to const Node& -Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def, - const KernelDef** def, string* kernel_class_name) { +Status FindKernelDef( + const DeviceType& device_type, StringPiece node_name, + bool has_experimental_debug_info, + const NodeDef_ExperimentalDebugInfo& experimental_debug_info, + StringPiece node_op, StringPiece node_device, AttrSlice node_attrs, + const KernelDef** def, string* kernel_class_name) { const KernelRegistration* reg = nullptr; bool was_attr_mismatch; - TF_RETURN_IF_ERROR( - FindKernelRegistration(device_type, node_def, ®, &was_attr_mismatch)); + TF_RETURN_IF_ERROR(FindKernelRegistration( + device_type, node_name, has_experimental_debug_info, + experimental_debug_info, node_op, node_attrs, ®, &was_attr_mismatch)); if (reg == nullptr) { Status s = errors::NotFound( - "No registered '", node_def.op(), "' OpKernel for ", + "No registered '", node_op, "' OpKernel for ", DeviceTypeString(device_type), " devices compatible with node ", - FormatNodeDefForError(node_def)); + FormatNodeDefForError(node_name, has_experimental_debug_info, + experimental_debug_info)); if (was_attr_mismatch) { errors::AppendToMessage( &s, " (OpKernel was found, but attributes didn't match) ", - "Requested Attributes: ", SummarizeAttrs(node_def)); + "Requested Attributes: ", + SummarizeAttrsHelper(node_attrs, node_device)); } - errors::AppendToMessage( - &s, ". Registered:", KernelsRegisteredForOp(node_def.op())); + errors::AppendToMessage(&s, + ". 
Registered:", KernelsRegisteredForOp(node_op)); return s; } if (def != nullptr) *def = ®->def; @@ -1201,6 +1229,14 @@ Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def, return Status::OK(); } +Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def, + const KernelDef** def, string* kernel_class_name) { + return FindKernelDef( + device_type, node_def.name(), node_def.has_experimental_debug_info(), + node_def.experimental_debug_info(), node_def.op(), node_def.device(), + AttrSlice(&node_def.attr()), def, kernel_class_name); +} + Status SupportedDeviceTypesForNode( const std::vector<DeviceType>& prioritized_types, const NodeDef& def, PrioritizedDeviceTypeVector* prioritized_device_types) { @@ -1249,10 +1285,11 @@ KernelList GetAllRegisteredKernels() { KernelList GetFilteredRegisteredKernels( const std::function<bool(const KernelDef&)>& predicate) { - const KernelRegistry* const typed_registry = GlobalKernelRegistryTyped(); + KernelRegistry* const typed_registry = GlobalKernelRegistryTyped(); KernelList kernel_list; - kernel_list.mutable_kernel()->Reserve(typed_registry->size()); - for (const auto& p : *typed_registry) { + tf_shared_lock lock(typed_registry->mu); + kernel_list.mutable_kernel()->Reserve(typed_registry->registry.size()); + for (const auto& p : typed_registry->registry) { const KernelDef& kernel_def = p.second.def; if (predicate(kernel_def)) { *kernel_list.add_kernel() = kernel_def; @@ -1378,7 +1415,9 @@ bool FindArgInOp(StringPiece arg_name, } // namespace Status ValidateKernelRegistrations(const OpRegistryInterface& op_registry) { - for (const auto& key_registration : *GlobalKernelRegistryTyped()) { + auto typed_registry = GlobalKernelRegistryTyped(); + tf_shared_lock lock(typed_registry->mu); + for (const auto& key_registration : typed_registry->registry) { const KernelDef& kernel_def(key_registration.second.def); const OpRegistrationData* op_reg_data; const Status status = op_registry.LookUp(kernel_def.op(), &op_reg_data); diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h index f851a2fcc08..20702a49147 100644 --- a/tensorflow/core/framework/op_kernel.h +++ b/tensorflow/core/framework/op_kernel.h @@ -18,9 +18,9 @@ limitations under the License. #include <atomic> #include <functional> - #include <utility> #include <vector> + #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/control_flow.h" @@ -28,6 +28,7 @@ limitations under the License. 
#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/kernel_def.pb.h" #include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" // TODO(b/62899350): Remove #include "tensorflow/core/framework/rendezvous.h" @@ -66,6 +67,7 @@ class TensorSliceReaderCacheWrapper; class AsyncOpKernel; class CallFrameInterface; +class DeviceMgr; class FunctionLibraryRuntime; class OpKernelConstruction; // declared below class OpKernelContext; // declared below, @@ -213,10 +215,10 @@ class OpKernel { const MemoryTypeVector input_memory_types_; const DataTypeVector output_types_; const MemoryTypeVector output_memory_types_; - const int graph_def_version_; - const bool is_internal_; // True if this is an internal operation NameRangeMap input_name_map_; NameRangeMap output_name_map_; + const int graph_def_version_; + const bool is_internal_; // True if this is an internal operation bool expensive_; std::atomic_uint_fast64_t cost_estimate_; @@ -647,6 +649,8 @@ class OpKernelContext { // Mechanism used by this op kernel invocation to communicate with // computations running on other devices. Rendezvous* rendezvous = nullptr; + const std::function<Status(const int64, const DeviceMgr*, Rendezvous** r)>* + create_rendezvous; // Mechanism for executing a collective op that needs to coordinate // with parallel instances running on other devices. @@ -1082,6 +1086,10 @@ class OpKernelContext { // An op kernel communicates with outside environment through // Rendezvous Send() and Recv(). Rendezvous* rendezvous() const { return params_->rendezvous; } + Status create_rendezvous(const int64 step_id, const DeviceMgr* device_mgr, + Rendezvous** r) const { + return (*params_->create_rendezvous)(step_id, device_mgr, r); + } CollectiveExecutor* collective_executor() const { return params_->collective_executor; @@ -1436,6 +1444,17 @@ class Name : public KernelDefBuilder { // Checks whether a given kernel is registered on device_type. bool KernelDefAvailable(const DeviceType& device_type, const NodeDef& node_def); +// If node of node_name, experimental_debug_info, node_op, node_device and +// node_attrs has a corresponding kernel registered on device_type, returns OK +// and fill in the kernel def and kernel_class_name. <def> and +// <kernel_class_name> may be null. +Status FindKernelDef( + const DeviceType& device_type, StringPiece node_name, + bool has_experimental_debug_info, + const NodeDef_ExperimentalDebugInfo& experimental_debug_info, + StringPiece node_op, StringPiece node_device, AttrSlice node_attrs, + const KernelDef** def, string* kernel_class_name); + // If node_def has a corresponding kernel registered on device_type, // returns OK and fill in the kernel def and kernel_class_name. <def> and // <kernel_class_name> may be null. diff --git a/tensorflow/core/framework/resource_mgr.cc b/tensorflow/core/framework/resource_mgr.cc index 6a94ff6642e..7e028d3f394 100644 --- a/tensorflow/core/framework/resource_mgr.cc +++ b/tensorflow/core/framework/resource_mgr.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include <atomic> - #include "tensorflow/core/framework/resource_mgr.h" +#include <atomic> + #include "tensorflow/core/framework/device_attributes.pb.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_util.h" @@ -104,6 +104,21 @@ ResourceMgr::ResourceMgr(const string& default_container) ResourceMgr::~ResourceMgr() { Clear(); } +void ResourceMgr::GetContainerResources( + const string& container, std::vector<ResourceEntry>* resources) const { + resources->clear(); + mutex_lock l(mu_); + Container* b = gtl::FindPtrOrNull(containers_, container); + if (b != nullptr) { + resources->reserve(b->size()); + for (auto& key_resource : *b) { + ResourceBase* resource = key_resource.second; + resource->Ref(); + resources->emplace_back(key_resource.first.second, resource); + } + } +} + void ResourceMgr::Clear() { mutex_lock l(mu_); for (const auto& p : containers_) { diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h index da547d5829f..61f60e8726b 100644 --- a/tensorflow/core/framework/resource_mgr.h +++ b/tensorflow/core/framework/resource_mgr.h @@ -145,6 +145,19 @@ class ResourceMgr { std::vector<std::unique_ptr<T, core::RefCountDeleter>>* resources) const TF_MUST_USE_RESULT; + // Retrieves all the resources within a container. If the container does not + // exist, it will not be created and the result vector will be empty. The + // resource member of the returned ResourceEntry data structures will own + // a reference to the ResourceBase object(s). + struct ResourceEntry { + ResourceEntry(string name, ResourceBase* resource) + : name(std::move(name)), resource(resource) {} + string name; + std::unique_ptr<ResourceBase, core::RefCountDeleter> resource; + }; + void GetContainerResources(const string& container, + std::vector<ResourceEntry>* resources) const; + // If "container" has a resource "name", returns it in // "*resource". Otherwise, invokes creator() to create the resource. // The caller takes the ownership of one ref on "*resource". diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc index e97b3c5fa79..1bd2a43dc98 100644 --- a/tensorflow/core/framework/tensor.cc +++ b/tensorflow/core/framework/tensor.cc @@ -35,6 +35,7 @@ limitations under the License. 
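// Illustrative sketch (not part of the patch): enumerating the resources in a
// container with the new ResourceMgr::GetContainerResources. Each returned
// ResourceEntry owns one reference to its ResourceBase, released when the
// entry is destroyed (core::RefCountDeleter). DumpContainer is a hypothetical
// helper for this example.
#include <string>
#include <vector>

#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/platform/logging.h"

void DumpContainer(const tensorflow::ResourceMgr& rm,
                   const std::string& container) {
  std::vector<tensorflow::ResourceMgr::ResourceEntry> entries;
  // Leaves `entries` empty if the container does not exist; the container is
  // never created as a side effect of the query.
  rm.GetContainerResources(container, &entries);
  for (const auto& entry : entries) {
    LOG(INFO) << container << "/" << entry.name << ": "
              << entry.resource->DebugString();
  }
}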
#include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_description.pb.h" #include "tensorflow/core/framework/type_traits.h" +#include "tensorflow/core/framework/typed_allocator.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/variant.h" #include "tensorflow/core/framework/variant_encode_decode.h" @@ -443,12 +444,14 @@ struct ProtoHelper<Eigen::half> { template <typename T> Buffer<T>::Buffer(Allocator* a, int64 n) - : BufferBase(a, a->Allocate<T>(n)), elem_(n) {} + : BufferBase(a, TypedAllocator::Allocate<T>(a, n, AllocationAttributes())), + elem_(n) {} template <typename T> Buffer<T>::Buffer(Allocator* a, int64 n, const AllocationAttributes& allocation_attr) - : BufferBase(a, a->Allocate<T>(n, allocation_attr)), elem_(n) {} + : BufferBase(a, TypedAllocator::Allocate<T>(a, n, allocation_attr)), + elem_(n) {} template <typename T> Buffer<T>::~Buffer() { @@ -456,7 +459,7 @@ Buffer<T>::~Buffer() { if (LogMemory::IsEnabled()) { RecordDeallocation(); } - alloc_->Deallocate<T>(static_cast<T*>(data()), elem_); + TypedAllocator::Deallocate<T>(alloc_, static_cast<T*>(data()), elem_); } } @@ -734,7 +737,7 @@ Tensor::Tensor(Allocator* a, DataType type, const TensorShape& shape) : shape_(shape), buf_(nullptr) { set_dtype(type); CHECK_NOTNULL(a); - if (shape_.num_elements() > 0 || a->ShouldAllocateEmptyTensors()) { + if (shape_.num_elements() > 0 || a->AllocatesOpaqueHandle()) { CASES(type, buf_ = new Buffer<T>(a, shape.num_elements())); } if (buf_ != nullptr && buf_->data() != nullptr && LogMemory::IsEnabled()) { @@ -748,7 +751,7 @@ Tensor::Tensor(Allocator* a, DataType type, const TensorShape& shape, : shape_(shape), buf_(nullptr) { set_dtype(type); CHECK_NOTNULL(a); - if (shape_.num_elements() > 0 || a->ShouldAllocateEmptyTensors()) { + if (shape_.num_elements() > 0 || a->AllocatesOpaqueHandle()) { CASES(type, buf_ = new Buffer<T>(a, shape.num_elements(), allocation_attr)); } if (!allocation_attr.allocation_will_be_logged && buf_ != nullptr && diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h index 6454cb818f2..edbdc29db0c 100644 --- a/tensorflow/core/framework/tensor.h +++ b/tensorflow/core/framework/tensor.h @@ -43,6 +43,7 @@ class OpKernelContext; class Tensor; class TensorBuffer; class TensorCApi; +class TensorCord; class TensorDescription; class TensorProto; class Var; @@ -237,7 +238,8 @@ class Tensor { return true; #else void* ptr = base<void>(); - return reinterpret_cast<intptr_t>(ptr) % EIGEN_MAX_ALIGN_BYTES == 0; + return dtype() == DT_STRING || + (reinterpret_cast<intptr_t>(ptr) % EIGEN_MAX_ALIGN_BYTES == 0); #endif } @@ -606,6 +608,7 @@ class Tensor { friend class DMAHelper; friend class TensorCApi; + friend class TensorCord; // For access to buf_ friend class TensorReference; // For access to buf_ friend class VariableOp; // For access to set_shape friend class AutoReloadVariableOp; // For access to set_shape diff --git a/tensorflow/core/framework/tracking_allocator.h b/tensorflow/core/framework/tracking_allocator.h index acec847bd36..428bffd9e15 100644 --- a/tensorflow/core/framework/tracking_allocator.h +++ b/tensorflow/core/framework/tracking_allocator.h @@ -18,7 +18,6 @@ limitations under the License. 
#include <unordered_map> #include "tensorflow/core/framework/allocator.h" -#include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" diff --git a/tensorflow/core/framework/typed_allocator.cc b/tensorflow/core/framework/typed_allocator.cc new file mode 100644 index 00000000000..25f15e54dca --- /dev/null +++ b/tensorflow/core/framework/typed_allocator.cc @@ -0,0 +1,32 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/typed_allocator.h" + +#include "tensorflow/core/framework/variant.h" + +namespace tensorflow { + +/* static */ +void TypedAllocator::RunVariantCtor(Variant* p, size_t n) { + for (size_t i = 0; i < n; ++p, ++i) new (p) Variant(); +} + +/* static */ +void TypedAllocator::RunVariantDtor(Variant* p, size_t n) { + for (size_t i = 0; i < n; ++p, ++i) p->~Variant(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/framework/typed_allocator.h b/tensorflow/core/framework/typed_allocator.h new file mode 100644 index 00000000000..7e1ea1bfae5 --- /dev/null +++ b/tensorflow/core/framework/typed_allocator.h @@ -0,0 +1,134 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TYPED_ALLOCATOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TYPED_ALLOCATOR_H_ + +#include <limits> + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/resource_handle.h" +#include "tensorflow/core/framework/type_traits.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Variant; + +// Convenience functions to do typed allocation. C++ constructors +// and destructors are invoked for complex types if necessary. +class TypedAllocator { + public: + // May return NULL if the tensor has too many elements to represent in a + // single allocation. + template <typename T> + static T* Allocate(Allocator* raw_allocator, size_t num_elements, + const AllocationAttributes& allocation_attr) { + // TODO(jeff): Do we need to allow clients to pass in alignment + // requirements? 
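    // The division below guards the sizeof(T) * num_elements multiplication
    // against size_t overflow: if num_elements exceeds max / sizeof(T), the
    // product would wrap around, so the request is rejected by returning
    // nullptr instead of being forwarded to AllocateRaw with a too-small size.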
+ + if (num_elements > (std::numeric_limits<size_t>::max() / sizeof(T))) { + return nullptr; + } + + void* p = + raw_allocator->AllocateRaw(Allocator::kAllocatorAlignment, + sizeof(T) * num_elements, allocation_attr); + T* typed_p = reinterpret_cast<T*>(p); + if (typed_p) RunCtor<T>(raw_allocator, typed_p, num_elements); + return typed_p; + } + + template <typename T> + static void Deallocate(Allocator* raw_allocator, T* ptr, + size_t num_elements) { + if (ptr) { + RunDtor<T>(raw_allocator, ptr, num_elements); + raw_allocator->DeallocateRaw(ptr); + } + } + + private: + // No constructors or destructors are run for simple types + template <typename T> + static void RunCtor(Allocator* raw_allocator, T* p, size_t n) { + static_assert(is_simple_type<T>::value, "T is not a simple type."); + } + + template <typename T> + static void RunDtor(Allocator* raw_allocator, T* p, size_t n) {} + + static void RunVariantCtor(Variant* p, size_t n); + + static void RunVariantDtor(Variant* p, size_t n); +}; + +template <> +/* static */ +inline void TypedAllocator::RunCtor(Allocator* raw_allocator, string* p, + size_t n) { + if (!raw_allocator->AllocatesOpaqueHandle()) { + for (size_t i = 0; i < n; ++p, ++i) new (p) string(); + } +} + +template <> +/* static */ +inline void TypedAllocator::RunDtor(Allocator* raw_allocator, string* p, + size_t n) { + if (!raw_allocator->AllocatesOpaqueHandle()) { + for (size_t i = 0; i < n; ++p, ++i) p->~string(); + } +} + +template <> +/* static */ +inline void TypedAllocator::RunCtor(Allocator* raw_allocator, ResourceHandle* p, + size_t n) { + if (!raw_allocator->AllocatesOpaqueHandle()) { + for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle(); + } +} + +template <> +/* static */ +inline void TypedAllocator::RunDtor(Allocator* raw_allocator, ResourceHandle* p, + size_t n) { + if (!raw_allocator->AllocatesOpaqueHandle()) { + for (size_t i = 0; i < n; ++p, ++i) p->~ResourceHandle(); + } +} + +template <> +/* static */ +inline void TypedAllocator::RunCtor(Allocator* raw_allocator, Variant* p, + size_t n) { + if (!raw_allocator->AllocatesOpaqueHandle()) { + RunVariantCtor(p, n); + } +} + +template <> +/* static */ +inline void TypedAllocator::RunDtor(Allocator* raw_allocator, Variant* p, + size_t n) { + if (!raw_allocator->AllocatesOpaqueHandle()) { + RunVariantDtor(p, n); + } +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TYPED_ALLOCATOR_H_ diff --git a/tensorflow/core/framework/variant.cc b/tensorflow/core/framework/variant.cc index d43e3c72ece..e61afeada90 100644 --- a/tensorflow/core/framework/variant.cc +++ b/tensorflow/core/framework/variant.cc @@ -23,9 +23,11 @@ limitations under the License. namespace tensorflow { +Variant::~Variant() { clear(); } + bool Variant::Decode(VariantTensorData data) { if (!is_empty()) { - return value_->Decode(std::move(data)); + return GetValue()->Decode(std::move(data)); } return true; } @@ -35,7 +37,7 @@ void* Variant::get() { if (is_empty()) { return nullptr; } - return value_->RawPtr(); + return GetValue()->RawPtr(); } template <> @@ -43,7 +45,7 @@ const void* Variant::get() const { if (is_empty()) { return nullptr; } - return value_->RawPtr(); + return GetValue()->RawPtr(); } template <> diff --git a/tensorflow/core/framework/variant.h b/tensorflow/core/framework/variant.h index 10eabbc85fd..9b4ee869618 100644 --- a/tensorflow/core/framework/variant.h +++ b/tensorflow/core/framework/variant.h @@ -23,6 +23,7 @@ limitations under the License. 
#include <unordered_map> #include <utility> +#include "absl/memory/memory.h" #include "tensorflow/core/framework/type_index.h" #include "tensorflow/core/framework/variant_tensor_data.h" #include "tensorflow/core/lib/core/status.h" @@ -68,7 +69,7 @@ void EncodeVariant(const T& value, string* buf); // // string TypeName() const; // void Encode(VariantTensorData* data) const; -// void Decode(VariantTensorData data); +// bool Decode(VariantTensorData data); // // Simple POD types can elide the Encode/Decode functions, they are provided by // helper methods. @@ -149,39 +150,57 @@ void EncodeVariant(const T& value, string* buf); // class Variant { public: - constexpr Variant() noexcept = default; + Variant() noexcept : is_inline_(false) {} - Variant(const Variant& other) - : value_(other.is_empty() ? std::unique_ptr<ValueInterface>() - : other.value_->Clone()) {} + ~Variant(); - Variant(Variant&& other) noexcept = default; + Variant(const Variant& other); + Variant(Variant&& other) noexcept; + + // Make sure that the type is CopyConstructible and not a + // tensorflow::Variant object itself. We want the copy constructor to be + // chosen for the tensorflow::Variant case. + template <typename T, typename VT = typename std::decay<T>::type, + typename std::enable_if<!std::is_same<Variant, VT>::value && + std::is_move_constructible<VT>::value, + void>::type* = nullptr> + Variant(T&& value); - // Make sure that the type is CopyConstructible and not a tensorflow::Variant - // object itself. We want the copy constructor to be chosen for the - // tensorflow::Variant case. template <typename T, typename VT = typename std::decay<T>::type, typename std::enable_if<!std::is_same<Variant, VT>::value && std::is_copy_constructible<VT>::value, void>::type* = nullptr> - Variant(T&& value) // NOLINT - : value_(new Value<VT>(in_place, std::forward<T>(value))) {} + Variant(const T& value); + + template <typename T, typename VT = typename std::decay<T>::type, + typename std::enable_if<!std::is_same<Variant, VT>::value && + std::is_copy_constructible<VT>::value, + void>::type* = nullptr> + Variant& operator=(const T& value); + + template <typename T, typename VT = typename std::decay<T>::type, + typename std::enable_if<!std::is_same<Variant, VT>::value && + std::is_move_constructible<VT>::value, + void>::type* = nullptr> + Variant& operator=(T&& value); Variant& operator=(const Variant& rhs) { + if (&rhs == this) return *this; Variant(rhs).swap(*this); return *this; } Variant& operator=(Variant&& rhs) noexcept { + if (&rhs == this) return *this; Variant(std::move(rhs)).swap(*this); return *this; } - bool is_empty() const { return value_ == nullptr; } + bool is_empty() const { return GetValue() == nullptr; } - void clear() noexcept { value_.reset(); } + void clear() noexcept; - void swap(Variant& other) noexcept { value_.swap(other.value_); } + void swap(Variant& other) noexcept; // Note, unlike TypeName(), TypeId() does not return the TypeIndex // of the original type when a TensorValueDataProto is stored as the @@ -191,12 +210,13 @@ class Variant { if (is_empty()) { return VoidTypeIndex; } - return value_->TypeId(); + return GetValue()->TypeId(); } string DebugString() const { - return strings::StrCat("Variant<type: ", TypeName(), - " value: ", value_->DebugString(), ">"); + return strings::StrCat( + "Variant<type: ", TypeName(), + " value: ", is_empty() ? 
"[empty]" : GetValue()->DebugString(), ">"); } // Returns a pointer to the stored value if it is type T, or nullptr @@ -205,7 +225,7 @@ class Variant { T* get() { const TypeIndex TTypeIndex = MakeTypeIndex<T>(); if (is_empty() || (TTypeIndex != TypeId())) return nullptr; - return std::addressof(static_cast<Variant::Value<T>*>(value_.get())->value); + return std::addressof(static_cast<Variant::Value<T>*>(GetValue())->value); } // Returns a pointer to the stored value if it is type T, or nullptr @@ -215,7 +235,7 @@ class Variant { const TypeIndex TTypeIndex = MakeTypeIndex<T>(); if (is_empty() || (TTypeIndex != TypeId())) return nullptr; return std::addressof( - static_cast<const Variant::Value<T>*>(value_.get())->value); + static_cast<const Variant::Value<T>*>(GetValue())->value); } // Returns TypeNameVariant(value). @@ -227,13 +247,13 @@ class Variant { if (is_empty()) { return ""; } - return value_->TypeName(); + return GetValue()->TypeName(); } // Serialize the contents of the stored object into `data`. void Encode(VariantTensorData* data) const { if (!is_empty()) { - value_->Encode(data); + GetValue()->Encode(data); } } @@ -243,26 +263,36 @@ class Variant { // Helper methods to directly serialize/deserialize from strings. void Encode(string* buf) const { if (!is_empty()) { - value_->Encode(buf); + GetValue()->Encode(buf); } } bool Decode(string buf) { if (!is_empty()) { - return value_->Decode(std::move(buf)); + return GetValue()->Decode(std::move(buf)); } return true; } + template <typename VT> + static constexpr bool CanInlineType() { + return ((sizeof(Value<VT>) <= InlineValue::kMaxValueSize) && + (alignof(Value<VT>) <= kMaxInlineValueAlignSize)); + } + private: struct in_place_t {}; - static constexpr in_place_t in_place{}; + static constexpr in_place_t kInPlace{}; struct ValueInterface { virtual ~ValueInterface() = default; virtual TypeIndex TypeId() const = 0; virtual void* RawPtr() = 0; virtual const void* RawPtr() const = 0; - virtual std::unique_ptr<ValueInterface> Clone() const = 0; + virtual ValueInterface* Clone() const = 0; + virtual void CloneInto(ValueInterface* memory) const = 0; + virtual void Swap(ValueInterface* memory) = 0; + virtual void MoveAssign(ValueInterface* memory) = 0; + virtual void MoveInto(ValueInterface* memory) = 0; virtual string TypeName() const = 0; virtual string DebugString() const = 0; virtual void Encode(VariantTensorData* data) const = 0; @@ -277,6 +307,10 @@ class Variant { explicit Value(in_place_t /*tag*/, Args&&... args) : value(std::forward<Args>(args)...) {} + // NOTE(ebrevdo): Destructor must be explicitly defined for CUDA to happily + // build `alignof(Variant<void*>)`. + ~Value() final = default; + TypeIndex TypeId() const override { const TypeIndex value_type_index = MakeTypeIndex<typename std::decay<T>::type>(); @@ -287,8 +321,33 @@ class Variant { const void* RawPtr() const override { return &value; } - std::unique_ptr<ValueInterface> Clone() const override { - return std::unique_ptr<ValueInterface>(new Value(in_place, value)); + ValueInterface* Clone() const override { + // NOTE: Use placement new here because we override `operator delete`, + // and need to match the call to `port::Free()` with a call to + // `port::Malloc()`. + auto* clone = static_cast<Value*>(port::Malloc(sizeof(Value))); + new (clone) Value(kInPlace, value); + return clone; + } + + void MoveAssign(ValueInterface* memory) override { + CHECK(TypeId() == memory->TypeId()) + << TypeId().name() << " vs. 
" << memory->TypeId().name(); + static_cast<Value*>(memory)->value = std::move(value); + } + + void CloneInto(ValueInterface* memory) const override { + new (memory) Value(kInPlace, value); + } + + void MoveInto(ValueInterface* memory) override { + new (memory) Value(kInPlace, std::move(value)); + } + + void Swap(ValueInterface* memory) override { + CHECK(TypeId() == memory->TypeId()) + << TypeId().name() << " vs. " << memory->TypeId().name(); + std::swap(value, static_cast<Value*>(memory)->value); } string TypeName() const override { return TypeNameVariant(value); } @@ -307,14 +366,363 @@ class Variant { bool Decode(string buf) override { return DecodeVariant(&buf, &value); } + // We override operator delete in order to selectively free memory + // depending on if Value<VT> is stored inline or on the heap: + // + // Value<VT> is stored inline if its size <= InlineValue::kMaxValueSize and + // its alignment <= kMaxInlineValueAlignSize. This check is performed by + // CanInlineType<VT>(). + // + // We only need to call its destructor in this case and then overwrite + // the inline memory with zeros. Variant::clear() does this. + // Thus, in the inline case, the delete operator does nothing (calling + // delete on the memory location calls the destructor only). + // + // If !CanInlineType<VT>(), then it is stored as a pointer inside HeapValue. + // The memory buffer it resides in on the heap was allocated with + // port::Malloc, and it should be deallocated via port::Free. + // + // operator delete is stored in the vtable since ~ValueInterface is a + // virtual destructor; furthermore it has access to VT and can calculate + // CanInlineType<VT>(). + static void operator delete(void* ptr); + + static void operator delete(void*, void*) { + // Some compilers require an overridden class-specific deallocation + // function, which will be called if placement `new` throws an + // exception. + } + T value; }; + static constexpr int kMaxInlineValueAlignSize = alignof(Value<void*>); + + using HeapValue = std::unique_ptr<ValueInterface>; + + struct InlineValue { + // We try to size InlineValue so that sizeof(Variant) <= 64 and it can fit + // into the aligned space of a TensorBuffer. + static constexpr int kMaxValueSize = (64 - /*some extra padding=*/16); + + typedef char ValueDataArray[kMaxValueSize]; + alignas(kMaxInlineValueAlignSize) ValueDataArray value_data; + bool has_value = false; + + explicit InlineValue() {} + + InlineValue(const InlineValue& other) noexcept + : has_value(other.has_value) { + if (other.has_value) { + other.AsValueInterface()->CloneInto(AsValueInterface()); + } + } + + InlineValue(InlineValue&& other) noexcept : has_value(other.has_value) { + if (other.has_value) { + other.AsValueInterface()->MoveInto(AsValueInterface()); + other.Cleanup(); + } + } + + void Cleanup() { + // **NOTE** This must be a no-op if the memory representation of + // InlineValue is all zeros, in order to properly interact with + // HeapOrInline::ResetMemory(). + if (has_value) { + // This doesn't actually delete anything on the heap; the delete + // operator of Value<VT> is overridden to do nothing for inline + // values; the side-effect of delete is that the virtual destructor is + // called. + // + // We leave it to callers to overwrite the data buffer in value_data + // with new objects. 
+ delete AsValueInterface(); + } + has_value = false; + } + + InlineValue& operator=(const InlineValue& other) { + if (&other == this) return *this; + Cleanup(); + if (other.has_value) { + other.AsValueInterface()->CloneInto(AsValueInterface()); + } + has_value = other.has_value; + return *this; + } + + InlineValue& operator=(InlineValue&& other) { + if (&other == this) return *this; + if (other.has_value) { + if (has_value && AsValueInterface()->TypeId() == + other.AsValueInterface()->TypeId()) { + other.AsValueInterface()->Swap(AsValueInterface()); + } else { + if (has_value) { + if (AsValueInterface()->TypeId() != + other.AsValueInterface()->TypeId()) { + Cleanup(); + other.AsValueInterface()->MoveInto(AsValueInterface()); + } else { + other.AsValueInterface()->MoveAssign(AsValueInterface()); + } + } else { + other.AsValueInterface()->MoveInto(AsValueInterface()); + } + other.Cleanup(); + has_value = true; + } + } else { + Cleanup(); + } + return *this; + } + + ValueInterface* AsValueInterface() { + return reinterpret_cast<ValueInterface*>(value_data); + } + + const ValueInterface* AsValueInterface() const { + return reinterpret_cast<const ValueInterface*>(value_data); + } + + // **WARNING** This must be a no-op when the byte-representation of + // InlineValue is all zeros. + ~InlineValue() { Cleanup(); } + }; // value_ can point to any type T as wrapped by a ValueInterface. // The only real requirement is that T is default-constructible. - std::unique_ptr<ValueInterface> value_; + union HeapOrInline { + HeapOrInline() { ResetMemory(); } + explicit HeapOrInline(HeapValue&& v) : heap_value(std::move(v)) {} + explicit HeapOrInline(InlineValue&& v) : inline_value(std::move(v)) {} + ~HeapOrInline() {} // Taken care of by owner. + + // This must be called when modifying which element of HeapOrInline is + // being used, because the destructor of the new class may be called + // while the memory is still a representation of the old class. + // **WARNING** This code assumes that the destructors of HeapValue and + // InlineValue are no-ops when the internal representation is zeros. + // + // Example of when this is needed: + // value.heap_value = HeapValue(...); + // // Segfault. This calls InlineValue::Cleanup on value.inline_value + // // but the internal memory representation is that of HeapValue. + // value.inline_value = InlineValue(); + // + // The correct way to do this: + // value.heap_value = HeapValue(...); + // value.ResetMemory(); + // value.inline_value = InlineValue(); + void ResetMemory(); + + HeapValue heap_value; + InlineValue inline_value; + } value_; + bool is_inline_; + + bool IsInlineValue() const { return is_inline_; } + + ValueInterface* GetValue() { + if (IsInlineValue()) { + return value_.inline_value.AsValueInterface(); + } else { + return value_.heap_value.get(); + } + } + + const ValueInterface* GetValue() const { + if (IsInlineValue()) { + return value_.inline_value.AsValueInterface(); + } else { + return value_.heap_value.get(); + } + } + + // PRECONDITION: Called on construction or clear() has been called before + // this method. 
+ template <typename T, typename VT> + void InsertValueMove(T&& value) { + if (is_inline_) { + Value<VT>* inline_value_data = + reinterpret_cast<Value<VT>*>(value_.inline_value.value_data); + new (inline_value_data) Value<VT>(kInPlace, std::forward<T>(value)); + value_.inline_value.has_value = true; + } else { + auto* moved = static_cast<Value<VT>*>(port::Malloc(sizeof(Value<VT>))); + new (moved) Value<VT>(kInPlace, std::forward<T>(value)); + value_.heap_value = HeapValue(moved); + } + } + + // PRECONDITION: Called on construction or clear() has been called before + // this method. + template <typename T, typename VT> + void InsertValueCopy(const T& value) { + if (is_inline_) { + Value<VT>* inline_value_data = + reinterpret_cast<Value<VT>*>(value_.inline_value.value_data); + new (inline_value_data) Value<VT>(kInPlace, value); + value_.inline_value.has_value = true; + } else { + auto* moved = static_cast<Value<VT>*>(port::Malloc(sizeof(Value<VT>))); + new (moved) Value<VT>(kInPlace, value); + value_.heap_value = HeapValue(moved); + } + } }; +// Make sure that a Variant object can reside in a 64-byte aligned Tensor +// buffer. +static_assert(sizeof(Variant) <= 64, + "Expected internal representation to be 64 bytes."); + +inline Variant::Variant(const Variant& other) : is_inline_(other.is_inline_) { + if (!other.is_empty()) { + if (other.IsInlineValue()) { + value_.inline_value = InlineValue(); + other.GetValue()->CloneInto(GetValue()); + value_.inline_value.has_value = true; + } else { + value_.heap_value = HeapValue(other.GetValue()->Clone()); + is_inline_ = false; + } + } +} + +inline Variant::Variant(Variant&& other) noexcept + : is_inline_(other.is_inline_) { + if (!other.is_empty()) { + if (other.IsInlineValue()) { + value_.inline_value = InlineValue(); + other.GetValue()->MoveInto(GetValue()); + value_.inline_value.has_value = true; + } else { + value_.heap_value = std::move(other.value_.heap_value); + is_inline_ = false; + } + } +} + +template <typename VT> +void Variant::Value<VT>::operator delete(void* ptr) { + if (!CanInlineType<VT>()) port::Free(ptr); +} + +template <typename T, typename VT, + typename std::enable_if<!std::is_same<Variant, VT>::value && + std::is_move_constructible<VT>::value, + void>::type*> +inline Variant::Variant(T&& value) : is_inline_(CanInlineType<VT>()) { + InsertValueMove<T, VT>(std::forward<T>(value)); +} + +template <typename T, typename VT, + typename std::enable_if<!std::is_same<Variant, VT>::value && + std::is_copy_constructible<VT>::value, + void>::type*> +inline Variant::Variant(const T& value) : is_inline_(CanInlineType<VT>()) { + InsertValueCopy<T, VT>(value); +} + +template <typename T, typename VT, + typename std::enable_if<!std::is_same<Variant, VT>::value && + std::is_move_constructible<VT>::value, + void>::type*> +inline Variant& Variant::operator=(T&& value) { + clear(); + is_inline_ = CanInlineType<VT>(); + InsertValueMove<T, VT>(std::forward<T>(value)); + return *this; +} + +template <typename T, typename VT, + typename std::enable_if<!std::is_same<Variant, VT>::value && + std::is_copy_constructible<VT>::value, + void>::type*> +inline Variant& Variant::operator=(const T& value) { + clear(); + is_inline_ = CanInlineType<VT>(); + InsertValueCopy<T, VT>(value); + return *this; +} + +inline void Variant::HeapOrInline::ResetMemory() { + memset( // NOLINT: not TriviallyCopyable + this, 0, sizeof(Variant::HeapOrInline)); +} + +inline void Variant::clear() noexcept { + if (!is_empty()) { + if (IsInlineValue()) { + 
value_.inline_value.~InlineValue(); + } else { + value_.heap_value.~HeapValue(); + } + value_.ResetMemory(); + } + is_inline_ = false; +} + +inline void Variant::swap(Variant& other) noexcept { + if (is_empty()) { + if (other.IsInlineValue()) { + value_.ResetMemory(); + value_.inline_value = std::move(other.value_.inline_value); + other.value_.ResetMemory(); + other.value_.heap_value = HeapValue(); + is_inline_ = true; + other.is_inline_ = false; + } else { + value_.ResetMemory(); + value_.heap_value = std::move(other.value_.heap_value); + other.value_.ResetMemory(); + other.value_.heap_value = HeapValue(); + is_inline_ = false; + other.is_inline_ = false; + } + } else if (other.is_empty()) { + if (IsInlineValue()) { + other.value_.ResetMemory(); + other.value_.inline_value = std::move(value_.inline_value); + value_.ResetMemory(); + value_.heap_value = HeapValue(); + other.is_inline_ = true; + is_inline_ = false; + } else { + other.value_.ResetMemory(); + other.value_.heap_value = std::move(value_.heap_value); + value_.ResetMemory(); + value_.heap_value = HeapValue(); + other.is_inline_ = false; + is_inline_ = false; + } + } else { // Both Variants have values. + if (other.IsInlineValue() && IsInlineValue()) { + std::swap(value_.inline_value, other.value_.inline_value); + } else if (!other.IsInlineValue() && !IsInlineValue()) { + std::swap(value_.heap_value, other.value_.heap_value); + } else if (other.IsInlineValue() && !IsInlineValue()) { + HeapValue v = std::move(value_.heap_value); + value_.ResetMemory(); + value_.inline_value = std::move(other.value_.inline_value); + other.value_.ResetMemory(); + other.value_.heap_value = std::move(v); + is_inline_ = true; + other.is_inline_ = false; + } else { // !other.IsInlineValue() && IsInlineValue() + HeapValue v = std::move(other.value_.heap_value); + other.value_.ResetMemory(); + other.value_.inline_value = std::move(value_.inline_value); + value_.ResetMemory(); + value_.heap_value = std::move(v); + is_inline_ = false; + other.is_inline_ = true; + } + } +} + template <> void* Variant::get(); diff --git a/tensorflow/core/framework/variant_tensor_data.h b/tensorflow/core/framework/variant_tensor_data.h index d98cf6b5e1f..8c654ccec82 100644 --- a/tensorflow/core/framework/variant_tensor_data.h +++ b/tensorflow/core/framework/variant_tensor_data.h @@ -62,6 +62,10 @@ class VariantTensorData { return GetMetadata<T>(value, PODResolver<T>()); } + string& metadata_string() { return metadata_; } + + const string& metadata_string() const { return metadata_; } + // Tensors contained within objects being serialized. int tensors_size() const; const Tensor& tensors(int index) const; diff --git a/tensorflow/core/framework/variant_test.cc b/tensorflow/core/framework/variant_test.cc index 8947f93887a..f12b0ea1e61 100644 --- a/tensorflow/core/framework/variant_test.cc +++ b/tensorflow/core/framework/variant_test.cc @@ -13,15 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. 
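// Illustrative sketch (not part of the patch): the reworked Variant keeps
// small, suitably aligned values inline within its 64-byte footprint and
// falls back to a heap allocation for anything larger; CanInlineType<T>()
// exposes that decision at compile time. SmallPOD and BigPOD are hypothetical
// types used only for this example.
#include "tensorflow/core/framework/variant.h"
#include "tensorflow/core/framework/variant_encode_decode.h"

struct SmallPOD {
  int value;
};

struct BigPOD {
  char payload[256];  // Larger than InlineValue::kMaxValueSize, so heap-backed.
};

static_assert(tensorflow::Variant::CanInlineType<SmallPOD>(),
              "small PODs are stored inline");
static_assert(!tensorflow::Variant::CanInlineType<BigPOD>(),
              "large values go through a heap allocation");

void Example() {
  tensorflow::Variant v = SmallPOD{42};  // Stored inline, no heap allocation.
  tensorflow::Variant w = BigPOD{};      // Stored behind a port::Malloc'd buffer.
  if (auto* p = v.get<SmallPOD>()) {
    // get<T>() returns nullptr on a type mismatch; here it yields 42.
    (void)p->value;
  }
  (void)w;
}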
==============================================================================*/ -#include <vector> - #include "tensorflow/core/framework/variant.h" -#include "tensorflow/core/framework/variant_encode_decode.h" -#include "tensorflow/core/framework/variant_tensor_data.h" + +#include <xmmintrin.h> + +#include <vector> #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/variant_encode_decode.h" +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/lib/core/coding.h" #include "tensorflow/core/platform/test.h" @@ -29,17 +32,206 @@ namespace tensorflow { namespace { -template <typename T> +template <typename T, bool BIG> struct Wrapper { T value; + char big[BIG ? 256 : 0]; string TypeName() const { return "POD"; } }; -using Int = Wrapper<int>; -using Float = Wrapper<float>; +template <bool BIG> +using Int = Wrapper<int, BIG>; + +template <bool BIG> +using Float = Wrapper<float, BIG>; + +template <bool BIG> +class MaybeAlive { + public: + MaybeAlive() : alive_(false) {} + + explicit MaybeAlive(bool alive) : alive_(alive) { + if (alive) ++live_counter_; + } + + ~MaybeAlive() { + if (alive_) --live_counter_; + } + + MaybeAlive(const MaybeAlive& rhs) : alive_(rhs.alive_) { + if (alive_) ++live_counter_; + } + + MaybeAlive& operator=(const MaybeAlive& rhs) { + if (this == &rhs) return *this; + if (alive_) --live_counter_; + alive_ = rhs.alive_; + if (alive_) ++live_counter_; + return *this; + } + + MaybeAlive(MaybeAlive&& rhs) : alive_(false) { + alive_ = std::move(rhs.alive_); + if (alive_) ++live_counter_; + } + + MaybeAlive& operator=(MaybeAlive&& rhs) { + if (this == &rhs) return *this; + if (alive_) --live_counter_; + alive_ = std::move(rhs.alive_); + if (alive_) ++live_counter_; + return *this; + } + + static int LiveCounter() { return live_counter_; } + + string TypeName() const { return "MaybeAlive"; } + void Encode(VariantTensorData* data) const {} + bool Decode(VariantTensorData data) { return false; } + + private: + bool alive_; + char big_[BIG ? 256 : 0]; + static int live_counter_; +}; + +template <> +int MaybeAlive<false>::live_counter_ = 0; +template <> +int MaybeAlive<true>::live_counter_ = 0; + +template <bool BIG> +class DeleteCounter { + public: + DeleteCounter() : big_{}, counter_(nullptr) {} + explicit DeleteCounter(int* counter) : big_{}, counter_(counter) {} + ~DeleteCounter() { + if (counter_) ++*counter_; + } + // Need custom move operations because int* just gets copied on move, but we + // need to clear counter_ on move. + DeleteCounter& operator=(const DeleteCounter& rhs) = default; + DeleteCounter& operator=(DeleteCounter&& rhs) { + if (this == &rhs) return *this; + counter_ = rhs.counter_; + rhs.counter_ = nullptr; + return *this; + } + DeleteCounter(DeleteCounter&& rhs) { + counter_ = rhs.counter_; + rhs.counter_ = nullptr; + } + DeleteCounter(const DeleteCounter& rhs) = default; + char big_[BIG ? 
256 : 0]; + int* counter_; + + string TypeName() const { return "DeleteCounter"; } + void Encode(VariantTensorData* data) const {} + bool Decode(VariantTensorData data) { return false; } +}; } // end namespace +TEST(VariantTest, MoveAndCopyBetweenBigAndSmall) { + Variant x; + int deleted_big = 0; + int deleted_small = 0; + x = DeleteCounter</*BIG=*/true>(&deleted_big); + EXPECT_EQ(deleted_big, 0); + x = DeleteCounter</*BIG=*/false>(&deleted_small); + EXPECT_EQ(deleted_big, 1); + EXPECT_EQ(deleted_small, 0); + x = DeleteCounter</*BIG=*/true>(&deleted_big); + EXPECT_EQ(deleted_big, 1); + EXPECT_EQ(deleted_small, 1); + x.clear(); + EXPECT_EQ(deleted_big, 2); + EXPECT_EQ(deleted_small, 1); + DeleteCounter</*BIG=*/true> big(&deleted_big); + DeleteCounter</*BIG=*/false> small(&deleted_small); + EXPECT_EQ(deleted_big, 2); + EXPECT_EQ(deleted_small, 1); + x = big; + EXPECT_EQ(deleted_big, 2); + EXPECT_EQ(deleted_small, 1); + x = small; + EXPECT_EQ(deleted_big, 3); + EXPECT_EQ(deleted_small, 1); + x = std::move(big); + EXPECT_EQ(deleted_big, 3); + EXPECT_EQ(deleted_small, 2); + x = std::move(small); + EXPECT_EQ(deleted_big, 4); + EXPECT_EQ(deleted_small, 2); + x.clear(); + EXPECT_EQ(deleted_big, 4); + EXPECT_EQ(deleted_small, 3); +} + +TEST(VariantTest, MoveAndCopyBetweenBigAndSmallVariants) { + int deleted_big = 0; + int deleted_small = 0; + { + Variant x = DeleteCounter</*BIG=*/true>(&deleted_big); + Variant y = DeleteCounter</*BIG=*/false>(&deleted_small); + EXPECT_EQ(deleted_big, 0); + EXPECT_EQ(deleted_small, 0); + x = y; + EXPECT_EQ(deleted_big, 1); + EXPECT_EQ(deleted_small, 0); + x = x; + EXPECT_EQ(deleted_big, 1); + EXPECT_EQ(deleted_small, 0); + EXPECT_NE(x.get<DeleteCounter<false>>(), nullptr); + EXPECT_NE(y.get<DeleteCounter<false>>(), nullptr); + x = std::move(y); + EXPECT_EQ(deleted_small, 1); + EXPECT_NE(x.get<DeleteCounter<false>>(), nullptr); + } + EXPECT_EQ(deleted_big, 1); + EXPECT_EQ(deleted_small, 2); + + deleted_big = 0; + deleted_small = 0; + { + Variant x = DeleteCounter</*BIG=*/false>(&deleted_small); + Variant y = DeleteCounter</*BIG=*/true>(&deleted_big); + EXPECT_EQ(deleted_big, 0); + EXPECT_EQ(deleted_small, 0); + x = y; + EXPECT_EQ(deleted_big, 0); + EXPECT_EQ(deleted_small, 1); + x = x; + EXPECT_EQ(deleted_big, 0); + EXPECT_EQ(deleted_small, 1); + EXPECT_NE(x.get<DeleteCounter<true>>(), nullptr); + EXPECT_NE(y.get<DeleteCounter<true>>(), nullptr); + x = std::move(y); + EXPECT_EQ(deleted_big, 1); + EXPECT_NE(x.get<DeleteCounter<true>>(), nullptr); + } + EXPECT_EQ(deleted_big, 2); + EXPECT_EQ(deleted_small, 1); +} + +template <bool BIG> +void TestDestructOnVariantMove() { + CHECK_EQ(MaybeAlive<BIG>::LiveCounter(), 0); + { + Variant a = MaybeAlive<BIG>(true); + Variant b = std::move(a); + } + EXPECT_EQ(MaybeAlive<BIG>::LiveCounter(), 0); +} + +TEST(VariantTest, RHSDestructOnVariantMoveBig) { + TestDestructOnVariantMove</*BIG=*/true>(); +} + +TEST(VariantTest, RHSDestructOnVariantMoveSmall) { + TestDestructOnVariantMove</*BIG=*/false>(); +} + TEST(VariantTest, Int) { Variant x; EXPECT_EQ(x.get<void>(), nullptr); @@ -49,45 +241,125 @@ TEST(VariantTest, Int) { EXPECT_EQ(x.TypeName(), "int"); } -TEST(VariantTest, Basic) { +struct MayCreateAlignmentDifficulties { + int a; + __m128 b; +}; + +bool M128AllEqual(const __m128& a, const __m128& b) { + return _mm_movemask_ps(_mm_cmpeq_ps(a, b)) == 0xf; +} + +TEST(VariantTest, NotAlignable) { + Variant x; + EXPECT_EQ(x.get<void>(), nullptr); + __m128 v = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + x = MayCreateAlignmentDifficulties{-1, v}; + 
EXPECT_NE(x.get<void>(), nullptr); + auto* x_val = x.get<MayCreateAlignmentDifficulties>(); + // check that *x_val == x + Variant y = x; + EXPECT_EQ(x_val->a, -1); + EXPECT_TRUE(M128AllEqual(x_val->b, v)); + auto* y_val = y.get<MayCreateAlignmentDifficulties>(); + EXPECT_EQ(y_val->a, -1); + EXPECT_TRUE(M128AllEqual(y_val->b, v)); + Variant z = std::move(y); + auto* z_val = z.get<MayCreateAlignmentDifficulties>(); + EXPECT_EQ(z_val->a, -1); + EXPECT_TRUE(M128AllEqual(z_val->b, v)); +} + +template <bool BIG> +void TestBasic() { Variant x; EXPECT_EQ(x.get<void>(), nullptr); - x = Int{42}; + x = Int<BIG>{42}; EXPECT_NE(x.get<void>(), nullptr); - EXPECT_NE(x.get<Int>(), nullptr); - EXPECT_EQ(x.get<Int>()->value, 42); + EXPECT_NE(x.get<Int<BIG>>(), nullptr); + EXPECT_EQ(x.get<Int<BIG>>()->value, 42); EXPECT_EQ(x.TypeName(), "POD"); } -TEST(VariantTest, ConstGet) { +TEST(VariantTest, Basic) { TestBasic<false>(); } + +TEST(VariantTest, BasicBig) { TestBasic<true>(); } + +template <bool BIG> +void TestConstGet() { Variant x; EXPECT_EQ(x.get<void>(), nullptr); - x = Int{42}; + x = Int<BIG>{42}; const Variant y = x; EXPECT_NE(y.get<void>(), nullptr); - EXPECT_NE(y.get<Int>(), nullptr); - EXPECT_EQ(y.get<Int>()->value, 42); + EXPECT_NE(y.get<Int<BIG>>(), nullptr); + EXPECT_EQ(y.get<Int<BIG>>()->value, 42); } -TEST(VariantTest, Clear) { +TEST(VariantTest, ConstGet) { TestConstGet<false>(); } + +TEST(VariantTest, ConstGetBig) { TestConstGet<true>(); } + +template <bool BIG> +void TestClear() { Variant x; EXPECT_EQ(x.get<void>(), nullptr); - x = Int{42}; + x = Int<BIG>{42}; EXPECT_NE(x.get<void>(), nullptr); - EXPECT_NE(x.get<Int>(), nullptr); - EXPECT_EQ(x.get<Int>()->value, 42); + EXPECT_NE(x.get<Int<BIG>>(), nullptr); + EXPECT_EQ(x.get<Int<BIG>>()->value, 42); x.clear(); EXPECT_EQ(x.get<void>(), nullptr); } +TEST(VariantTest, Clear) { TestClear<false>(); } + +TEST(VariantTest, ClearBig) { TestClear<true>(); } + +template <bool BIG> +void TestClearDeletes() { + Variant x; + EXPECT_EQ(x.get<void>(), nullptr); + + int deleted_count = 0; + using DC = DeleteCounter<BIG>; + DC dc(&deleted_count); + EXPECT_EQ(deleted_count, 0); + x = dc; + EXPECT_EQ(deleted_count, 0); + + EXPECT_NE(x.get<void>(), nullptr); + EXPECT_NE(x.get<DC>(), nullptr); + + x.clear(); + EXPECT_EQ(x.get<void>(), nullptr); + EXPECT_EQ(deleted_count, 1); + + x = dc; + EXPECT_EQ(deleted_count, 1); + + Variant y = x; + EXPECT_EQ(deleted_count, 1); + + x.clear(); + EXPECT_EQ(deleted_count, 2); + + y.clear(); + EXPECT_EQ(deleted_count, 3); +} + +TEST(VariantTest, ClearDeletesOnHeap) { TestClearDeletes</*BIG=*/true>(); } + +TEST(VariantTest, ClearDeletesOnStack) { TestClearDeletes</*BIG=*/false>(); } + TEST(VariantTest, Tensor) { Variant x; Tensor t(DT_FLOAT, {}); @@ -101,6 +373,16 @@ TEST(VariantTest, Tensor) { EXPECT_EQ(x.TypeName(), "tensorflow::Tensor"); } +TEST(VariantTest, NontrivialTensorVariantCopy) { + Tensor variants(DT_VARIANT, {}); + Tensor t(true); + test::FillValues<Variant>(&variants, gtl::ArraySlice<Variant>({t})); + const Tensor* t_c = variants.flat<Variant>()(0).get<Tensor>(); + EXPECT_EQ(t_c->dtype(), t.dtype()); + EXPECT_EQ(t_c->shape(), t.shape()); + EXPECT_EQ(t_c->scalar<bool>()(), t.scalar<bool>()()); +} + TEST(VariantTest, TensorProto) { Variant x; TensorProto t; @@ -114,31 +396,41 @@ TEST(VariantTest, TensorProto) { EXPECT_EQ(x.get<TensorProto>()->tensor_shape().unknown_rank(), true); } -TEST(VariantTest, CopyValue) { +template <bool BIG> +void TestCopyValue() { Variant x, y; - x = Int{10}; + x = Int<BIG>{10}; y = x; - 
EXPECT_EQ(x.get<Int>()->value, 10); - EXPECT_EQ(x.get<Int>()->value, y.get<Int>()->value); + EXPECT_EQ(x.get<Int<BIG>>()->value, 10); + EXPECT_EQ(x.get<Int<BIG>>()->value, y.get<Int<BIG>>()->value); } -TEST(VariantTest, MoveValue) { +TEST(VariantTest, CopyValue) { TestCopyValue<false>(); } + +TEST(VariantTest, CopyValueBig) { TestCopyValue<true>(); } + +template <bool BIG> +void TestMoveValue() { Variant x; x = []() -> Variant { Variant y; - y = Int{10}; + y = Int<BIG>{10}; return y; }(); - EXPECT_EQ(x.get<Int>()->value, 10); + EXPECT_EQ(x.get<Int<BIG>>()->value, 10); } +TEST(VariantTest, MoveValue) { TestMoveValue<false>(); } + +TEST(VariantTest, MoveValueBig) { TestMoveValue<true>(); } + TEST(VariantTest, TypeMismatch) { Variant x; - x = Int{10}; + x = Int<false>{10}; EXPECT_EQ(x.get<float>(), nullptr); EXPECT_EQ(x.get<int>(), nullptr); - EXPECT_NE(x.get<Int>(), nullptr); + EXPECT_NE(x.get<Int<false>>(), nullptr); } struct TensorList { @@ -206,19 +498,26 @@ TEST(VariantTest, TensorListTest) { "Variant<type: TensorList value: ", data.DebugString(), ">")); } -TEST(VariantTest, VariantArray) { +template <bool BIG> +void TestVariantArray() { Variant x[2]; - x[0] = Int{2}; - x[1] = Float{2.0f}; + x[0] = Int<BIG>{2}; + x[1] = Float<BIG>{2.0f}; - EXPECT_EQ(x[0].get<Int>()->value, 2); - EXPECT_EQ(x[1].get<Float>()->value, 2.0f); + EXPECT_EQ(x[0].get<Int<BIG>>()->value, 2); + EXPECT_EQ(x[1].get<Float<BIG>>()->value, 2.0f); } -TEST(VariantTest, PodUpdate) { +TEST(VariantTest, VariantArray) { TestVariantArray<false>(); } + +TEST(VariantTest, VariantArrayBig) { TestVariantArray<true>(); } + +template <bool BIG> +void PodUpdateTest() { struct Pod { int x; float y; + char big[BIG ? 256 : 0]; string TypeName() const { return "POD"; } }; @@ -232,10 +531,16 @@ TEST(VariantTest, PodUpdate) { EXPECT_EQ(x.get<Pod>()->x, 30); } -TEST(VariantTest, EncodeDecodePod) { +TEST(VariantTest, PodUpdate) { PodUpdateTest<false>(); } + +TEST(VariantTest, PodUpdateBig) { PodUpdateTest<true>(); } + +template <bool BIG> +void TestEncodeDecodePod() { struct Pod { int x; float y; + char big[BIG ? 
256 : 0]; string TypeName() const { return "POD"; } }; @@ -247,14 +552,17 @@ TEST(VariantTest, EncodeDecodePod) { VariantTensorData serialized; x.Encode(&serialized); - Variant y; - y = Pod(); + Variant y = Pod{}; y.Decode(serialized); EXPECT_EQ(p.x, y.get<Pod>()->x); EXPECT_EQ(p.y, y.get<Pod>()->y); } +TEST(VariantTest, EncodeDecodePod) { TestEncodeDecodePod<false>(); } + +TEST(VariantTest, EncodeDecodePodBig) { TestEncodeDecodePod<true>(); } + TEST(VariantTest, EncodeDecodeTensor) { Variant x; Tensor t(DT_INT32, {}); diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index 3be500d592b..6574f3bf622 100644 --- a/tensorflow/core/graph/graph.cc +++ b/tensorflow/core/graph/graph.cc @@ -315,9 +315,15 @@ Status Node::input_tensor(int idx, OutputTensor* t) const { // NodeDebugInfo NodeDebugInfo::NodeDebugInfo(const Node& n) : NodeDebugInfo(n.def()) {} -NodeDebugInfo::NodeDebugInfo(const NodeDef& ndef) : name(ndef.name()) { - if (ndef.has_experimental_debug_info()) { - const auto& names = ndef.experimental_debug_info().original_node_names(); +NodeDebugInfo::NodeDebugInfo(const NodeDef& ndef) + : NodeDebugInfo(ndef.name(), ndef.has_experimental_debug_info(), + ndef.experimental_debug_info()) {} +NodeDebugInfo::NodeDebugInfo( + StringPiece node_name, bool has_experimental_debug_info, + const NodeDef_ExperimentalDebugInfo& experimental_debug_info) + : name(node_name) { + if (has_experimental_debug_info) { + const auto& names = experimental_debug_info.original_node_names(); original_node_names.assign(names.begin(), names.end()); } } diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 197058ef4df..6913f50aa74 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -40,7 +40,9 @@ limitations under the License. #include <functional> #include <string> #include <vector> + #include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/edgeset.h" @@ -309,6 +311,8 @@ struct NodeDebugInfo { NodeDebugInfo(const Node& n); NodeDebugInfo(const NodeDef& ndef); + NodeDebugInfo(StringPiece node_name, bool has_experimental_debug_info, + const NodeDef_ExperimentalDebugInfo& experimental_debug_info); }; // Represents an input of a node, i.e., the `index`-th input to `node`. @@ -766,15 +770,20 @@ inline bool IsHostMemoryPreserving(const Node* node) { return IsIdentity(node) || IsControlFlow(node); } +// NOTE: We declare Reference type of NodeIter and NeighborIter as Node* (see +// https://en.cppreference.com/w/cpp/iterator/iterator). + // Iterator for stepping through the nodes of a graph. -class NodeIter { +class NodeIter + : public std::iterator<std::forward_iterator_tag, Node, std::ptrdiff_t, + /*Pointer*/ Node*, /*Reference*/ Node*> { public: NodeIter(const Graph* graph, int id); - bool operator==(const NodeIter& rhs); - bool operator!=(const NodeIter& rhs); + bool operator==(const NodeIter& rhs) const; + bool operator!=(const NodeIter& rhs) const; void operator++(); - Node* operator*(); - Node* operator->(); + reference operator*() const; + pointer operator->() const; private: // Invariant: id_ == graph_->num_node_ids() || graph_->FindId(id_) != nullptr @@ -783,14 +792,16 @@ class NodeIter { }; // Iterator for stepping through the neighbors of a node. 
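The iterator-trait changes above are mostly about interoperability with generic code; a minimal usage sketch, assuming a populated Graph g (illustration only):

  // NodeIter/NeighborIter expose Node* as both pointer and reference type, so
  // range-based for loops over Graph::nodes() and Node::in_nodes() yield raw
  // Node pointers directly.
  for (Node* n : g.nodes()) {
    for (Node* in : n->in_nodes()) {
      VLOG(2) << in->name() << " -> " << n->name();
    }
  }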
-class NeighborIter { +class NeighborIter + : public std::iterator<std::forward_iterator_tag, Node, std::ptrdiff_t, + /*Pointer*/ Node*, /*Reference*/ Node*> { public: NeighborIter(EdgeSet::const_iterator iter, bool incoming); - bool operator==(const NeighborIter& rhs); - bool operator!=(const NeighborIter& rhs); + bool operator==(const NeighborIter& rhs) const; + bool operator!=(const NeighborIter& rhs) const; void operator++(); - Node* operator*(); - Node* operator->(); + reference operator*() const; + pointer operator->() const; private: EdgeSet::const_iterator iter_; @@ -802,12 +813,12 @@ class NeighborIter { inline NodeIter::NodeIter(const Graph* graph, int id) : graph_(graph), id_(id) {} -inline bool NodeIter::operator==(const NodeIter& rhs) { +inline bool NodeIter::operator==(const NodeIter& rhs) const { DCHECK(graph_ == rhs.graph_); return id_ == rhs.id_; } -inline bool NodeIter::operator!=(const NodeIter& rhs) { +inline bool NodeIter::operator!=(const NodeIter& rhs) const { return !(*this == rhs); } @@ -821,29 +832,29 @@ inline void NodeIter::operator++() { } } -inline Node* NodeIter::operator*() { return graph_->FindNodeId(id_); } +inline Node* NodeIter::operator*() const { return graph_->FindNodeId(id_); } -inline Node* NodeIter::operator->() { return graph_->FindNodeId(id_); } +inline Node* NodeIter::operator->() const { return graph_->FindNodeId(id_); } inline NeighborIter::NeighborIter(EdgeSet::const_iterator iter, bool incoming) : iter_(iter), incoming_(incoming) {} -inline bool NeighborIter::operator==(const NeighborIter& rhs) { +inline bool NeighborIter::operator==(const NeighborIter& rhs) const { return iter_ == rhs.iter_ && incoming_ == rhs.incoming_; } -inline bool NeighborIter::operator!=(const NeighborIter& rhs) { +inline bool NeighborIter::operator!=(const NeighborIter& rhs) const { return !(*this == rhs); } inline void NeighborIter::operator++() { ++iter_; } -inline Node* NeighborIter::operator*() { +inline Node* NeighborIter::operator*() const { const Edge* e = *iter_; return incoming_ ? e->src() : e->dst(); } -inline Node* NeighborIter::operator->() { +inline Node* NeighborIter::operator->() const { const Edge* e = *iter_; return incoming_ ? e->src() : e->dst(); } diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD index cf321f9c19b..fc550900d33 100644 --- a/tensorflow/core/grappler/clusters/BUILD +++ b/tensorflow/core/grappler/clusters/BUILD @@ -132,6 +132,7 @@ tf_cc_test( tags = [ "no_cuda_on_cpu_tap", "no_gpu", + "nomsan", # TODO(b/132138608): Re-enable this. ], deps = [ ":single_machine", diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc index f1d3a77e3f0..f7af7cc374f 100644 --- a/tensorflow/core/grappler/clusters/utils.cc +++ b/tensorflow/core/grappler/clusters/utils.cc @@ -18,9 +18,9 @@ limitations under the License. 
#include "third_party/eigen3/Eigen/Core" #if GOOGLE_CUDA -#include "cuda/include/cuda.h" -#include "cuda/include/cuda_runtime_api.h" -#include "cuda/include/cudnn.h" +#include "third_party/gpus/cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cudnn/cudnn.h" #endif #if TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc index cd47188b65e..bc69e77a37f 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.cc +++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc @@ -112,31 +112,25 @@ void LIFOManager::RemoveCurrNode() { curr_pos_ = nodes_.end(); // Reset curr_pos_. } -FirstReadyManager::FirstReadyManager() : ReadyNodeManager() { +HeapReadyManager::HeapReadyManager() : ReadyNodeManager() { std::make_heap(nodes_.begin(), nodes_.end()); } -Status FirstReadyManager::Init( +Status HeapReadyManager::Init( const std::unordered_map<const NodeDef*, NodeState>* node_map) { - // Reset the node state since different instances of the scheduler can reuse + // Resets the node state since different instances of the scheduler can reuse // the same node_manager. node_map_ = node_map; nodes_.clear(); waiting_queue_.clear(); - greater_ = [this](const NodeDef* a, const NodeDef* b) -> bool { - if (node_map_->at(a).time_ready == node_map_->at(b).time_ready) { - // Use Node name as tie-breaker for deterministic node scheduling. - return a->name().compare(b->name()) > 0; - } else { - // Note: we need a node with minimum time_ready, not maximum; hence, using - // a > b for comparison function. - return node_map_->at(a).time_ready > node_map_->at(b).time_ready; - } - }; + + // Sets up the comparator for the heap. + greater_ = Greater(); + return Status::OK(); } -const NodeDef* FirstReadyManager::GetCurrNode() { +const NodeDef* HeapReadyManager::GetCurrNode() { if (nodes_.empty()) { // Nothing in the node_; probably, the very first call. Move waiting_queue_ // to node_. @@ -146,7 +140,7 @@ const NodeDef* FirstReadyManager::GetCurrNode() { return nodes_.front(); } -void FirstReadyManager::RemoveCurrNode() { +void HeapReadyManager::RemoveCurrNode() { if (nodes_.empty()) { // Make sure that there is a node to be removed at the front of nodes_. GetCurrNode(); @@ -156,11 +150,11 @@ void FirstReadyManager::RemoveCurrNode() { DrainWaitingQueue(); } -bool FirstReadyManager::Empty() const { +bool HeapReadyManager::Empty() const { return nodes_.empty() && waiting_queue_.empty(); } -void FirstReadyManager::DrainWaitingQueue() { +void HeapReadyManager::DrainWaitingQueue() { for (const auto* node : waiting_queue_) { // push_heap in AddNode() and pop_heap in RemoveCurrNode() guarantees that // the first element is the node with minimum time_ready. @@ -170,6 +164,44 @@ void FirstReadyManager::DrainWaitingQueue() { waiting_queue_.clear(); } +std::function<bool(const NodeDef*, const NodeDef*)> +FirstReadyManager::Greater() { + auto greater = [this](const NodeDef* a, const NodeDef* b) -> bool { + if (node_map_->at(a).time_ready == node_map_->at(b).time_ready) { + // Use Node name as tie-breaker for deterministic node scheduling. + return a->name().compare(b->name()) > 0; + } else { + // Note: we need a node with minimum time_ready, not maximum; hence, using + // a > b for comparison function. 
+ return node_map_->at(a).time_ready > node_map_->at(b).time_ready; + } + }; + return greater; +} + +std::function<bool(const NodeDef*, const NodeDef*)> +PriorityReadyManager::Greater() { + auto greater = [this](const NodeDef* a, const NodeDef* b) -> bool { + return node_priority_.at(a->name()) > node_priority_.at(b->name()); + }; + return greater; +} + +Status PriorityReadyManager::SetPriority( + const std::unordered_map<string, int>& node_priority) { + // Checks each node has a unique priority. + std::unordered_set<int> priorities; + for (const auto& it : node_priority_) { + if (priorities.find(it.second) != priorities.end()) { + return errors::InvalidArgument("Non-unique priority found"); + } + priorities.insert(it.second); + } + + node_priority_ = node_priority; + return Status::OK(); +} + CompositeNodeManager::CompositeNodeManager() : ReadyNodeManager(), send_manager_(), recv_manager_() {} diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h index 47d5cc23a5a..821353fd301 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.h +++ b/tensorflow/core/grappler/costs/virtual_scheduler.h @@ -180,21 +180,22 @@ class LIFOManager : public ReadyNodeManager { std::list<const NodeDef*>::iterator curr_pos_ = nodes_.end(); }; -// FirstReadyManager picks a node with the minimum time_ready value. -// Behavior is deterministic when there are more than one nodes with the minimum -// time_ready value with unique node names as the tie-breaker. -class FirstReadyManager : public ReadyNodeManager { +// Abstract class that maintains a heap/priority queue for scheduling ready +// nodes. Derived class needs to implement the Greater() function which returns +// the comparator for the heap. +class HeapReadyManager : public ReadyNodeManager { public: - FirstReadyManager(); + HeapReadyManager(); Status Init( const std::unordered_map<const NodeDef*, NodeState>* node_map) override; - ~FirstReadyManager() override {} + ~HeapReadyManager() override {} void AddNode(const NodeDef* node) override { waiting_queue_.push_back(node); } const NodeDef* GetCurrNode() override; void RemoveCurrNode() override; bool Empty() const override; - private: + protected: + virtual std::function<bool(const NodeDef*, const NodeDef*)> Greater() = 0; // Move all the nodes in the waiting_queue_ to nodes_. void DrainWaitingQueue(); @@ -214,6 +215,37 @@ class FirstReadyManager : public ReadyNodeManager { const std::unordered_map<const NodeDef*, NodeState>* node_map_; }; +// FirstReadyManager picks a node with the minimum time_ready value. +// Behavior is deterministic when there are more than one nodes with the minimum +// time_ready value with unique node names as the tie-breaker. +class FirstReadyManager : public HeapReadyManager { + public: + FirstReadyManager() : HeapReadyManager() {} + ~FirstReadyManager() override {} + + protected: + std::function<bool(const NodeDef*, const NodeDef*)> Greater() override; +}; + +// PriorityReadyManager uses the given node priorities when picking up next node +// from all the ready nodes. +class PriorityReadyManager : public HeapReadyManager { + public: + PriorityReadyManager() : HeapReadyManager() {} + ~PriorityReadyManager() override {} + + // Note this should be called after Init(). + Status SetPriority(const std::unordered_map<string, int>& node_priority); + + protected: + std::function<bool(const NodeDef*, const NodeDef*)> Greater() override; + + private: + // A map from unique node name to unique priority. 
Lower number means higher + // priority. + std::unordered_map<string, int> node_priority_; +}; + // CompositeNodeManager has a few other NodeManagers: per-device LIFO for normal // ops (neither _Send nor _Recv) and FirstReadyManagers for _Send ops and _Recv // ops, and then it chooses FirstReady among the ops chosen from each diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc index dd7c0c2c583..3e867e30c8b 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc +++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc @@ -372,6 +372,40 @@ TEST_F(ReadyNodeManagerTest, DeterminismInFirstReadyManager) { EXPECT_TRUE(manager2.Empty()); } +TEST_F(ReadyNodeManagerTest, GetAndRemoveMultiplePriorityReadyManager) { + PriorityReadyManager manager; + TF_EXPECT_OK(manager.Init(&node_states_)); + + // Sets up node priorities. + std::unordered_map<string, int> node_priority = {{"Node1", 1}, {"Node2", 2}, + {"Node3", 3}, {"Node4", 4}, + {"Node5", 5}, {"Node6", 6}}; + TF_EXPECT_OK(manager.SetPriority(node_priority)); + + // Inserts nodes in some random order. + manager.AddNode(&node2_); + manager.AddNode(&node1_); + manager.AddNode(&node4_); + manager.AddNode(&node5_); + manager.AddNode(&node3_); + manager.AddNode(&node6_); + + // Expects nodes scheduled based on priority. + EXPECT_EQ(manager.GetCurrNode()->name(), "Node1"); + manager.RemoveCurrNode(); + EXPECT_EQ(manager.GetCurrNode()->name(), "Node2"); + manager.RemoveCurrNode(); + EXPECT_EQ(manager.GetCurrNode()->name(), "Node3"); + manager.RemoveCurrNode(); + EXPECT_EQ(manager.GetCurrNode()->name(), "Node4"); + manager.RemoveCurrNode(); + EXPECT_EQ(manager.GetCurrNode()->name(), "Node5"); + manager.RemoveCurrNode(); + EXPECT_EQ(manager.GetCurrNode()->name(), "Node6"); + manager.RemoveCurrNode(); + EXPECT_TRUE(manager.Empty()); +} + TEST_F(ReadyNodeManagerTest, RemoveSingleNodeCompositeNodeManager) { CompositeNodeManager manager; TF_EXPECT_OK(manager.Init(&node_states_)); diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 431f8f9a989..49de09ac748 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -268,6 +268,7 @@ cc_library( "//tensorflow/core/grappler:op_types", "//tensorflow/core/grappler:utils", "//tensorflow/core/grappler/costs:graph_properties", + "//tensorflow/core/grappler/utils:canonicalizer", "//tensorflow/core/grappler/utils:symbolic_shapes", "//tensorflow/core/grappler/utils:topological_sort", "//tensorflow/core/grappler/utils:traversal", @@ -603,6 +604,7 @@ cc_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler/clusters:virtual_cluster", + "//tensorflow/core/grappler/utils:canonicalizer", "//tensorflow/core/grappler/utils:colocation", "//tensorflow/core/grappler/utils:functions", "//tensorflow/core/grappler/utils:topological_sort", diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc index 15f88752366..6f801d48f8b 100644 --- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc @@ -39,6 +39,7 @@ limitations under the License. 
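The refactoring above turns the heap machinery into a reusable base class, so a new scheduling policy only has to supply a comparator. A minimal sketch of a further, purely hypothetical policy built on the HeapReadyManager interface declared above:

  // Hypothetical manager: schedules ready nodes in ascending name order by
  // overriding Greater(); AddNode/GetCurrNode/RemoveCurrNode are inherited.
  class NameOrderReadyManager : public HeapReadyManager {
   public:
    NameOrderReadyManager() : HeapReadyManager() {}
    ~NameOrderReadyManager() override {}

   protected:
    std::function<bool(const NodeDef*, const NodeDef*)> Greater() override {
      return [](const NodeDef* a, const NodeDef* b) {
        // The heap keeps the "smallest" element at the front, so returning
        // a->name() > b->name() yields lexicographically ascending order.
        return a->name() > b->name();
      };
    }
  };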
#include "tensorflow/core/grappler/optimizers/constant_folding.h" #include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h" #include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/grappler/utils/canonicalizer.h" #include "tensorflow/core/grappler/utils/symbolic_shapes.h" #include "tensorflow/core/grappler/utils/topological_sort.h" #include "tensorflow/core/grappler/utils/traversal.h" @@ -2114,26 +2115,32 @@ class FoldMultiplyIntoConv : public ArithmeticOptimizerStage { TF_RETURN_IF_TRUE(NumNonControlOutputs(*source, *ctx().node_map) != 1); const NodeDef* mul = source; - - // TODO(jingyue): handle the case where `scale` is 0-th operand. - NodeDef* scale; // scalar multiplier fot the input tensor + int input_idx = 0; + int scale_idx = 1; + NodeDef* scale; // scalar multiplier for the input tensor NodeDef* input; - TF_RETURN_IF_ERROR(GetInputNode(mul->input(1), &scale)); - TF_RETURN_IF_ERROR(GetInputNode(mul->input(0), &input)); - - // Check that 'scale * weight' can be const folded. + TF_RETURN_IF_ERROR(GetInputNode(mul->input(scale_idx), &scale)); + TF_RETURN_IF_ERROR(GetInputNode(mul->input(input_idx), &input)); + if (!IsConstant(*scale) && IsConstant(*input)) { + VLOG(3) << "Swapped inputs to mul"; + std::swap(scale_idx, input_idx); + std::swap(scale, input); + } TF_RETURN_IF_TRUE(!IsConstant(*scale)); - TF_RETURN_IF_ERROR(CheckAttrsExist(*scale, {"dtype", "value"})); - TF_RETURN_IF_ERROR(CheckAttrExists(*weights, "dtype")); - TF_RETURN_IF_TRUE(scale->attr().at("dtype").type() != - weights->attr().at("dtype").type()); - // Check that `scale` is a scalar. + // Check that one of the inputs to mul is a constant scalar. const TensorProto& scale_tensor = scale->attr().at("value").tensor(); bool scale_is_a_scalar = scale_tensor.has_tensor_shape() && scale_tensor.tensor_shape().dim_size() == 0; TF_RETURN_IF_TRUE(!scale_is_a_scalar); + // Check that 'scale * weight' can be const folded. + TF_RETURN_IF_TRUE(!IsConstant(*scale)); + TF_RETURN_IF_ERROR(CheckAttrsExist(*scale, {"dtype"})); + TF_RETURN_IF_ERROR(CheckAttrExists(*weights, "dtype")); + TF_RETURN_IF_TRUE(scale->attr().at("dtype").type() != + weights->attr().at("dtype").type()); + // At this point all preconditions are met, and we safely do the rewrite. VLOG(3) << "Fold multiply into conv: conv=" << conv->name() << " mul=" << mul->name() << " weights=" << weights->name(); @@ -2148,7 +2155,7 @@ class FoldMultiplyIntoConv : public ArithmeticOptimizerStage { // Link in its inputs. scaled_weights->add_input(conv->input(1)); ctx().node_map->AddOutput(weights->name(), scaled_weights->name()); - scaled_weights->add_input(mul->input(1)); + scaled_weights->add_input(mul->input(scale_idx)); ctx().node_map->AddOutput(scale->name(), scaled_weights->name()); ForwardControlDependencies(scaled_weights, {source}); @@ -2159,7 +2166,7 @@ class FoldMultiplyIntoConv : public ArithmeticOptimizerStage { AddToOptimizationQueue(conv); // Update `tail` node to bypass `mul` because it's folded to the weights. 
- tail->set_input(0, mul->input(0)); + tail->set_input(0, mul->input(input_idx)); ctx().node_map->UpdateInput(tail->name(), mul->name(), input->name()); AddToOptimizationQueue(tail); *simplified_node_name = conv->name(); @@ -3326,6 +3333,21 @@ class UniqueNodes { return node; } + void RemoveRepresentative(NodeDef* node) { + auto it = memoized_signatures_.find(node); + if (it == memoized_signatures_.end()) return; + + std::vector<NodeDef*>& candidates = rep_[it->second]; + for (int i = 0; i < candidates.size(); ++i) { + if (candidates[i] == node) { + std::swap(candidates[i], candidates[candidates.size() - 1]); + candidates.resize(candidates.size() - 1); + break; + } + } + memoized_signatures_.erase(node); + } + private: uint64 ComputeSignature(const NodeDef& node); bool SameNode(const NodeDef& node1, const NodeDef& node2) const; @@ -3355,6 +3377,9 @@ uint64 UniqueNodes::ComputeSignature(const NodeDef& node) { return h; } +// PRECONDITION: +// Node input orders are assumed to be canonicalized, i.e. control inputs for +// all nodes as well as regular inputs for commutative nodes must be sorted. bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const { if (node1.op() != node2.op()) { return false; @@ -3370,38 +3395,13 @@ bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const { } // Compare inputs. - if (IsCommutative(node1)) { - std::vector<string> inputs1(node1.input().begin(), node1.input().end()); - std::sort(inputs1.begin(), inputs1.end()); - std::vector<string> inputs2(node2.input().begin(), node2.input().end()); - std::sort(inputs2.begin(), inputs2.end()); - return inputs1 == inputs2; - } else { - // The order or ordinary inputs matters. - int index = 0; - for (; index < node1.input_size(); ++index) { - if (IsControlInput(node1.input(index))) { - break; - } else if (node1.input(index) != node2.input(index)) { - return false; - } - } - // The order of control inputs does not matter. - if (index < node1.input_size()) { - std::vector<string> ctrl_inputs1(node1.input().begin() + index, - node1.input().end()); - std::sort(ctrl_inputs1.begin(), ctrl_inputs1.end()); - std::vector<string> ctrl_inputs2(node2.input().begin() + index, - node2.input().end()); - std::sort(ctrl_inputs2.begin(), ctrl_inputs2.end()); - return ctrl_inputs1 != ctrl_inputs2; - } + auto it1 = node1.input().begin(); + auto it2 = node2.input().begin(); + for (; it1 != node1.input().end(); ++it1, ++it2) { + if (*it1 != *it2) return false; } // Compare attributes. 
- if (node1.attr().size() != node2.attr().size()) { - return false; - } for (const auto& attr1 : node1.attr()) { auto it = node2.attr().find(attr1.first); if (it == node2.attr().end()) return false; @@ -3429,6 +3429,10 @@ bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const { } void ArithmeticOptimizer::DedupComputations() { + CanonicalizeGraph(optimized_graph_); + // LOG(INFO) << "Graph after canonicalization: \n" + // << optimized_graph_->DebugString(); + GraphTopologyView graph_view; if (!graph_view.InitializeFromGraph(*optimized_graph_).ok()) { LOG(WARNING) << "Failed to initialize GraphTopologyView."; @@ -3478,26 +3482,38 @@ void ArithmeticOptimizer::DedupComputations() { if (feeds_inplace_op.find(rep) != feeds_inplace_op.end()) { continue; } - VLOG(3) << "Remove duplicated node: node=" << node->name() - << " representative=" << rep->name(); const std::set<NodeDef*>& tmp = node_map_->GetOutputs(node->name()); std::vector<NodeDef*> fanouts(tmp.begin(), tmp.end()); for (NodeDef* fanout : fanouts) { + // Update consumers of node. + bool updated_fanout = false; for (int i = 0; i < fanout->input_size(); ++i) { string* fanout_input = fanout->mutable_input(i); + const int position = NodePositionIfSameNode(*fanout_input, node->name()); // Update name in-place. if (position < -1) { continue; - } else if (position > 0) { - *fanout_input = StrCat(rep->name(), ":", position); - } else if (position == 0) { - *fanout_input = rep->name(); } else { - *fanout_input = StrCat("^", rep->name()); + if (!updated_fanout) { + // The signature of the fanout node will change. Remove it from + // nodes. + nodes.RemoveRepresentative(fanout); + } + updated_fanout = true; + if (position > 0) { + *fanout_input = StrCat(rep->name(), ":", position); + } else if (position == 0) { + *fanout_input = rep->name(); + } else { + *fanout_input = StrCat("^", rep->name()); + } } - node_map_->AddOutput(rep->name(), fanout->name()); + } + if (updated_fanout) { + node_map_->UpdateInput(fanout->name(), node->name(), rep->name()); + CanonicalizeNode(fanout); } } duplicates.insert(i); @@ -3513,21 +3529,6 @@ void ArithmeticOptimizer::DedupComputations() { } } -void ArithmeticOptimizer::ForwardControlDependencies( - NodeDef* target_node, const std::vector<const NodeDef*>& src_nodes) { - for (const auto& src : src_nodes) { - for (int i = src->input_size() - 1; i >= 0; --i) { - if (IsControlInput(src->input(i))) { - *target_node->add_input() = src->input(i); - node_map_->AddOutput(NodeName(src->input(i)), target_node->name()); - } else { - break; - } - } - } - DedupControlInputs(target_node); -} - Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) { SetVector<NodeDef*> nodes_to_simplify; nodes_to_simplify.Reserve(optimized_graph_->node_size()); @@ -3540,7 +3541,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) { &feed_nodes_, opt_level_); const ArithmeticOptimizerContext ctx_ext(&nodes_to_simplify); - // Stop pipeline after first stage returning non-empty simplified tensor name. + // Stop pipeline after first stage returning non-empty simplified tensor + // name. 
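DedupComputations now relies on a canonical input order instead of sorting inside SameNode. A rough sketch of the invariant being assumed, written as a hypothetical helper for illustration only (the real work is done by CanonicalizeGraph/CanonicalizeNode from grappler/utils/canonicalizer; assumes <algorithm> and the grappler IsControlInput helper are available):

  // Hypothetical illustration of the precondition SameNode() relies on:
  // control inputs sorted for every node, and regular inputs sorted for
  // commutative ops, so duplicate detection can compare input lists
  // element-wise.
  void CanonicalizeInputsForDedup(NodeDef* node, bool is_commutative) {
    auto* inputs = node->mutable_input();
    // Regular inputs precede control inputs by convention; find the boundary.
    auto first_ctrl = std::stable_partition(
        inputs->begin(), inputs->end(),
        [](const string& name) { return !IsControlInput(name); });
    std::sort(first_ctrl, inputs->end());  // control-input order is irrelevant
    if (is_commutative) {
      std::sort(inputs->begin(), first_ctrl);  // e.g. Add/Mul operand order
    }
  }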
const auto stop = [](const string& result) { return !result.empty(); }; GraphOptimizerStagePipeline<string> pipeline(stop); @@ -3658,19 +3660,19 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/, fetch_nodes_known_ = !item.fetch.empty(); GrapplerItem optimized_item(item); optimized_graph_ = &optimized_item.graph; - node_map_.reset(new NodeMap(optimized_graph_)); + node_map_.reset(new NodeMap(optimized_graph_)); for (const auto& feed : item.feed) { feed_nodes_.insert(NodeName(feed.first)); } - // Disable restricted graph rewrites. + // // Disable restricted graph rewrites. options_.unary_ops_composition &= item.optimization_options().allow_non_differentiable_rewrites; // Perform topological sort on the graph in order to help DedupComputations - // and AddOpsRewrite to optimize larger subgraphs starting from the roots with - // more inputs. + // and AddOpsRewrite to optimize larger subgraphs starting from the roots + // with more inputs. TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph_)); GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED(); diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc index 89046d2bb30..d9ce9f66b7a 100644 --- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc @@ -163,11 +163,10 @@ TEST_F(ArithmeticOptimizerTest, OpDeduppingAssertAndCheckNumerics) { EXPECT_EQ(output.node_size(), 5); const NodeDef* new_div = node_map.GetNode("div"); ASSERT_NE(new_div, nullptr); - ASSERT_EQ(new_div->input_size(), 4); + ASSERT_EQ(new_div->input_size(), 3); EXPECT_EQ(new_div->input(0), "check1"); EXPECT_EQ(new_div->input(1), "check1"); EXPECT_EQ(new_div->input(2), "^assert1"); - EXPECT_EQ(new_div->input(3), "^assert1"); auto tensors = EvaluateNodes(output, item.fetch, {{"Placeholder", bool_t}}); EXPECT_EQ(tensors.size(), 1); @@ -507,8 +506,8 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) { const NodeDef* mul_node = node_map.GetNode(HoistMulName("Add_6")); ASSERT_NE(mul_node, nullptr); ASSERT_EQ(mul_node->input_size(), 2); - EXPECT_EQ(mul_node->input(0), "Placeholder"); - EXPECT_EQ(mul_node->input(1), HoistAddName("Add_6")); + EXPECT_EQ(mul_node->input(0), HoistAddName("Add_6")); + EXPECT_EQ(mul_node->input(1), "Placeholder"); const NodeDef* add_6_node = node_map.GetNode(HoistAddName("Add_6")); ASSERT_NE(add_6_node, nullptr); @@ -1578,47 +1577,53 @@ TEST_F(ArithmeticOptimizerTest, RemoveIdentityTransposesThroughChain) { } TEST_F(ArithmeticOptimizerTest, FoldMulToTransposeConv) { - tensorflow::Scope s = tensorflow::Scope::NewRootScope(); - Output inputs = ops::Placeholder(s.WithOpName("inputs"), DT_FLOAT, - ops::Placeholder::Shape({8, 28, 28, 3})); - Output scale = ops::Const(s.WithOpName("scale"), 1.0f / 255.0f, {}); - Output scaled_inputs = - ops::Multiply(s.WithOpName("scaled_inputs"), inputs, scale); - Output perm_nhwc_to_nchw = - ops::Const(s.WithOpName("perm_nhwc_to_nchw"), {0, 3, 1, 2}, {4}); - Output inputs_nchw = ops::Transpose(s.WithOpName("inputs_nchw"), - scaled_inputs, perm_nhwc_to_nchw); - Output weights = ops::Const(s.WithOpName("weights"), - Input::Initializer(127.0f, {5, 5, 3, 16})); - Output conv = - ops::Conv2D(s.WithOpName("conv"), inputs_nchw, weights, {1, 1, 1, 1}, - "VALID", ops::Conv2D::DataFormat("NCHW")); - Output outputs = ops::Identity(s.WithOpName("outputs"), conv); + for (bool swap_inputs : {false, true}) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + 
Output inputs = ops::Placeholder(s.WithOpName("inputs"), DT_FLOAT, + ops::Placeholder::Shape({1, 28, 28, 3})); + Output scale = ops::Const(s.WithOpName("scale"), 1.0f / 255.0f, {}); + Output scaled_inputs = ops::Multiply(s.WithOpName("scaled_inputs"), + swap_inputs ? scale : inputs, + swap_inputs ? inputs : scale); + Output perm_nhwc_to_nchw = + ops::Const(s.WithOpName("perm_nhwc_to_nchw"), {0, 3, 1, 2}, {4}); + Output inputs_nchw = ops::Transpose(s.WithOpName("inputs_nchw"), + scaled_inputs, perm_nhwc_to_nchw); + Output weights = ops::Const(s.WithOpName("weights"), + Input::Initializer(127.0f, {5, 5, 3, 4})); + Output conv = + ops::Conv2D(s.WithOpName("conv"), inputs_nchw, weights, {1, 1, 1, 1}, + "VALID", ops::Conv2D::DataFormat("NCHW")); + Output outputs = ops::Identity(s.WithOpName("outputs"), conv); - GrapplerItem item; - item.fetch = {"outputs"}; - TF_CHECK_OK(s.ToGraphDef(&item.graph)); + GrapplerItem item; + item.fetch = {"outputs"}; + TF_CHECK_OK(s.ToGraphDef(&item.graph)); - GraphDef output; - ArithmeticOptimizer optimizer; - EnableOnlyFoldMultipleIntoConv(&optimizer); - OptimizeTwiceAndPrune(&optimizer, &item, &output); + // LOG(INFO) << "Before:\n" << item.graph.DebugString(); + GraphDef output; + ArithmeticOptimizer optimizer; + EnableOnlyFoldMultipleIntoConv(&optimizer); + OptimizeTwiceAndPrune(&optimizer, &item, &output); - NodeMap node_map(&output); + // LOG(INFO) << "After:\n" << output.DebugString(); + NodeMap node_map(&output); + // `conv` is now a folded convolution with scaled weights. + const NodeDef* folded_conv = node_map.GetNode(conv.node()->name()); + ASSERT_NE(folded_conv, nullptr); - // `conv` is now a folded convolution with scaled weights. - const NodeDef* folded_conv = node_map.GetNode(conv.node()->name()); - ASSERT_NE(folded_conv, nullptr); + const NodeDef* folded_conv_weights = + node_map.GetNode(folded_conv->input(1)); + ASSERT_NE(folded_conv_weights, nullptr); + EXPECT_EQ(folded_conv_weights->op(), "Mul"); - const NodeDef* folded_conv_weights = node_map.GetNode(folded_conv->input(1)); - ASSERT_NE(folded_conv_weights, nullptr); - EXPECT_EQ(folded_conv_weights->op(), "Mul"); - - // Its input should be a transpose of `inputs`. - const NodeDef* transpose = node_map.GetNode(NodeName(folded_conv->input(0))); - ASSERT_NE(transpose, nullptr); - ASSERT_EQ(transpose->input_size(), 2); - EXPECT_EQ(transpose->input(0), "inputs"); + // Its input should be a transpose of `inputs`. 
+ const NodeDef* transpose = + node_map.GetNode(NodeName(folded_conv->input(0))); + ASSERT_NE(transpose, nullptr); + ASSERT_EQ(transpose->input_size(), 2); + EXPECT_EQ(transpose->input(0), "inputs"); + } } TEST_F(ArithmeticOptimizerTest, NotFoldMulAcrossPreservedTranspose) { @@ -1921,8 +1926,8 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewriteAddOpsOfIdenticalShape) { auto a = ops::Variable(s.WithOpName("a"), {2, 2}, DT_FLOAT); auto b = ops::Variable(s.WithOpName("b"), {2, 2}, DT_FLOAT); auto c = ops::Variable(s.WithOpName("c"), {2, 2}, DT_FLOAT); - auto add_ab = ops::Add(sx.WithOpName("Add_ab"), a, b); - auto add_abc = ops::Add(sy.WithOpName("Add_abc"), add_ab, c); + auto add_bc = ops::Add(sx.WithOpName("Add_bc"), b, c); + auto add_abc = ops::Add(sy.WithOpName("Add_abc"), a, add_bc); auto outputs = ops::Identity(s.WithOpName("outputs"), add_abc); @@ -1948,9 +1953,9 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewriteAddOpsOfIdenticalShape) { // // + // / \ - // + c --> AddN(a, b, c) - // / \ - // a b + // a + --> AddN(a, b, c) + // / \ + // b c EXPECT_EQ(output.node_size(), 5); NodeMap node_map(&output); diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h index dce8914026b..862401ba6ac 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h @@ -22,7 +22,7 @@ limitations under the License. #if GOOGLE_CUDA // Needed for CUDA_VERSION macro. -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda.h" #endif // GOOGLE_CUDA namespace tensorflow { diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc index bd195f2cfa1..da98356446b 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding.cc @@ -3404,7 +3404,6 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item, TF_RETURN_IF_ERROR( RunOptimizationPass(cluster, item_to_optimize, optimized_graph)); } while (graph_modified_ || optimized_graph->node_size() != node_count); - TF_RETURN_IF_ERROR(CompressConstants(optimized_graph)); *optimized_graph->mutable_library() = item.graph.library(); *optimized_graph->mutable_versions() = item.graph.versions(); diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc index 11c362c0360..2b64f3ac04d 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc @@ -3840,55 +3840,6 @@ TEST_F(ConstantFoldingTest, BitcastDenormalFloats) { test::ExpectTensorEqual<int64>(tensors[0], tensors_expected[0]); } -TEST_F(ConstantFoldingTest, CompressConstants) { - tensorflow::Scope scope = tensorflow::Scope::NewRootScope(); - Tensor zeros_t(DT_FLOAT, TensorShape({64})); - Tensor ones_t(DT_FLOAT, TensorShape({64})); - for (int i = 0; i < 64; ++i) { - zeros_t.flat<float>()(i) = 0.0f; - ones_t.flat<float>()(i) = 1.0f; - } - Output zeros = ops::Const(scope.WithOpName("zeros"), zeros_t); - Output host_ones = ops::Const(scope.WithOpName("host_ones"), ones_t); - GrapplerItem item; - TF_CHECK_OK(scope.ToGraphDef(&item.graph)); - ASSERT_EQ(item.graph.node(1).name(), "host_ones"); - // There is not C++ api for HostConst, so we manually change the node type - // here. 
- item.graph.mutable_node(1)->set_op("HostConst"); - item.fetch = {"zeros", "host_ones"}; - auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {}); - - ConstantFolding optimizer(/*cpu_device=*/nullptr); - GraphDef output; - TF_EXPECT_OK(optimizer.Optimize(/*cluster=*/nullptr, item, &output)); - - { - ASSERT_EQ(output.node_size(), 2); - const NodeDef& node = output.node(0); - EXPECT_EQ(node.name(), "zeros"); - EXPECT_EQ(node.op(), "Const"); - const TensorProto& zeroes_t = node.attr().at("value").tensor(); - EXPECT_EQ(zeroes_t.float_val_size(), 1); - EXPECT_EQ(zeroes_t.float_val(0), 0.0f); - } - { - const NodeDef& node = output.node(1); - EXPECT_EQ(node.name(), "host_ones"); - EXPECT_EQ(node.op(), "HostConst"); - const TensorProto& ones_t = node.attr().at("value").tensor(); - EXPECT_EQ(ones_t.float_val_size(), 1); - EXPECT_EQ(ones_t.float_val(0), 1.0f); - } - - auto tensors = EvaluateNodes(output, item.fetch, {}); - ASSERT_EQ(tensors.size(), 2); - ASSERT_EQ(tensors_expected.size(), 2); - for (int i = 0; i < 2; ++i) { - test::ExpectTensorEqual<float>(tensors[i], tensors_expected[i]); - } -} - } // namespace } // namespace grappler } // namespace tensorflow diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD index fa69ac74b18..d3858d6b1f2 100644 --- a/tensorflow/core/grappler/optimizers/data/BUILD +++ b/tensorflow/core/grappler/optimizers/data/BUILD @@ -27,6 +27,7 @@ cc_library( ":noop_elimination", ":parallel_batch", ":shuffle_and_repeat_fusion", + ":slack", ], ) @@ -206,6 +207,7 @@ cc_library( "graph_utils.h", ], deps = [ + "//tensorflow/core/grappler:grappler_item", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -700,6 +702,45 @@ tf_cc_test( ], ) +cc_library( + name = "slack", + srcs = ["slack.cc"], + hdrs = [ + "slack.h", + ], + deps = [ + ":graph_utils", + ":optimizer_base", + "@com_google_absl//absl/strings", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core/grappler:mutable_graph_view", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:op_types", + "//tensorflow/core/grappler:utils", + "//tensorflow/core/grappler/clusters:cluster", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", + ] + tf_protos_all(), + alwayslink = 1, +) + +tf_cc_test( + name = "slack_test", + srcs = ["slack_test.cc"], + deps = [ + ":function_utils", + ":graph_utils", + ":slack", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler/utils:functions", + "//tensorflow/core/kernels/data", + ], +) + cc_library( name = "vectorization_utils", srcs = ["vectorization_utils.cc"], diff --git a/tensorflow/core/grappler/optimizers/data/auto_shard.cc b/tensorflow/core/grappler/optimizers/data/auto_shard.cc index d5d48180234..0652df4dc5e 100644 --- a/tensorflow/core/grappler/optimizers/data/auto_shard.cc +++ b/tensorflow/core/grappler/optimizers/data/auto_shard.cc @@ -326,17 +326,16 @@ Status OptimizeGraph(const GrapplerItem& item, int64 num_workers, int64 index, // that dataset, in effect giving a piece to each worker. Finally, we remove // occurences from randomness from before that point in the graph (e.g. things // like ShuffleDataset) to ensure that `shard` returns a sensible result. 
- - NodeDef sink_node; - TF_RETURN_IF_ERROR(graph_utils::FindSinkNode(item.graph, &sink_node)); - Status s = RecursivelyHandleOp(sink_node, num_workers, index, &flib, &graph, + NodeDef* sink_node; + TF_RETURN_IF_ERROR(graph_utils::GetFetchNode(graph, item, &sink_node)); + Status s = RecursivelyHandleOp(*sink_node, num_workers, index, &flib, &graph, &nodes_to_delete); if (!s.ok() && errors::IsNotFound(s)) { LOG(WARNING) << "Cannot find shardable dataset, adding a shard node at " << "the end of the dataset instead. This may have performance " << "implications."; - TF_RETURN_IF_ERROR(AddShardNode(&graph, sink_node, num_workers, index)); + TF_RETURN_IF_ERROR(AddShardNode(&graph, *sink_node, num_workers, index)); } else if (!s.ok()) { return s; } diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_utils.cc index 4adfeaf60da..758f7786aff 100644 --- a/tensorflow/core/grappler/optimizers/data/graph_utils.cc +++ b/tensorflow/core/grappler/optimizers/data/graph_utils.cc @@ -301,36 +301,17 @@ Status EnsureNodeNamesUnique(Graph* g) { return Status::OK(); } -// Tries to find a "sink" node in the graph. A sink node is defined as a node -// that has at least one input and no outputs. If there are multiple of these, -// this might return any one of them. This is useful to identify the final -// Dataset op in the graph but in some cases there might be multiple Identity -// ops added to the end and this would return the last Identity op in that case. -Status FindSinkNode(const GraphDef& graph_def, NodeDef* sink_node) { - absl::flat_hash_map<string, int> all_node_names; - absl::flat_hash_map<string, int> node_input_map; - for (int i = 0; i < graph_def.node_size(); ++i) { - all_node_names.insert_or_assign(graph_def.node(i).name(), i); - node_input_map.insert_or_assign(graph_def.node(i).name(), 0); +Status GetFetchNode(const MutableGraphView& graph, const GrapplerItem& item, + NodeDef** fetch_node) { + if (item.fetch.size() != 1) { + return errors::InvalidArgument( + "Expected only one fetch node but there were ", item.fetch.size(), ": ", + absl::StrJoin(item.fetch, ", ")); } - // Counts how many graph nodes for each input name. Candidate sink - // nodes are ones which are inputs into zero nodes. - for (const NodeDef& node : graph_def.node()) { - for (const string& input_name : node.input()) { - node_input_map[input_name]++; - } - } - for (const auto& it : node_input_map) { - if (it.second == 0) { - const NodeDef& sink_graph_node = graph_def.node(all_node_names[it.first]); - if (sink_graph_node.input_size() == 0) { - continue; - } - *sink_node = sink_graph_node; - return Status::OK(); - } - } - return errors::InvalidArgument("Failed to find a sink node"); + + *fetch_node = graph.GetNode(item.fetch.at(0)); + + return Status::OK(); } } // namespace graph_utils diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.h b/tensorflow/core/grappler/optimizers/data/graph_utils.h index 0253b6d90b5..417a8c4ffd1 100644 --- a/tensorflow/core/grappler/optimizers/data/graph_utils.h +++ b/tensorflow/core/grappler/optimizers/data/graph_utils.h @@ -24,6 +24,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/mutable_graph_view.h" #include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/lib/core/errors.h" @@ -144,8 +145,10 @@ void ConcatAttributeList(const string& attribute_name, const NodeDef& first, // and renaming nodes does not mutate any edges. Status EnsureNodeNamesUnique(Graph* g); -// Returns the sink node (i.e. last node) in the graph. -Status FindSinkNode(const GraphDef& graph_def, NodeDef* sink_node); +// Returns the item's fetch node, if there is exactly one. Otherwise, returns an +// error. +Status GetFetchNode(const MutableGraphView& graph, const GrapplerItem& item, + NodeDef** fetch_node); } // namespace graph_utils } // namespace grappler diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc index 8108c84fe4a..93df72ab623 100644 --- a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc +++ b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc @@ -270,37 +270,45 @@ TEST(GraphUtilsTest, EnsureNodeNamesUnique) { EXPECT_NE(const_0->name(), const_2->name()); } -TEST(GraphUtilsTest, TestFindSinkNodeStandard) { - GraphDef graph_def; - MutableGraphView graph(&graph_def); +TEST(GraphUtilsTest, TestGetFetchNode) { + GrapplerItem item; + MutableGraphView graph(&item.graph); - AddNode("node1", "Identity", {}, {}, &graph); - AddNode("node2", "Identity", {"node1"}, {}, &graph); - NodeDef* node3 = AddNode("node3", "Identity", {"node2"}, {}, &graph); + NodeDef* node1 = AddNode("node1", "Identity", {}, {}, &graph); + NodeDef* node2 = AddNode("node2", "Identity", {node1->name()}, {}, &graph); + NodeDef* node3 = AddNode("node3", "Identity", {node2->name()}, {}, &graph); + item.fetch.push_back(node3->name()); - NodeDef sink_node; - TF_EXPECT_OK(FindSinkNode(graph_def, &sink_node)); - EXPECT_EQ(sink_node.name(), node3->name()); + NodeDef* sink_node; + TF_EXPECT_OK(GetFetchNode(graph, item, &sink_node)); + EXPECT_EQ(sink_node->name(), node3->name()); } -TEST(GraphUtilsTest, TestFindSinkNodeNoSingleSink) { - GraphDef graph_def; - MutableGraphView graph(&graph_def); +TEST(GraphUtilsTest, TestFindSinkNodeMultipleFetches) { + GrapplerItem item; + MutableGraphView graph(&item.graph); - AddNode("node1", "Identity", {}, {}, &graph); - AddNode("node2", "Identity", {}, {}, &graph); + NodeDef* node1 = AddNode("node1", "Identity", {}, {}, &graph); + NodeDef* node2 = AddNode("node2", "Identity", {node1->name()}, {}, &graph); + NodeDef* node3 = AddNode("node3", "Identity", {node2->name()}, {}, &graph); + item.fetch.push_back(node2->name()); + item.fetch.push_back(node3->name()); - NodeDef sink_node; - Status s = FindSinkNode(graph_def, &sink_node); + NodeDef* sink_node; + Status s = GetFetchNode(graph, item, &sink_node); EXPECT_FALSE(s.ok()); } -TEST(GraphUtilsTest, TestFindSinkNodeGraphDefEmpty) { - GraphDef graph_def; - MutableGraphView graph(&graph_def); +TEST(GraphUtilsTest, TestFindSinkNodeNoFetches) { + GrapplerItem item; + MutableGraphView graph(&item.graph); - NodeDef sink_node; - Status s = FindSinkNode(graph_def, &sink_node); + NodeDef* node1 = AddNode("node1", "Identity", {}, {}, &graph); + NodeDef* node2 = AddNode("node2", "Identity", {node1->name()}, {}, &graph); + AddNode("node3", "Identity", {node2->name()}, {}, &graph); + + NodeDef* sink_node; + Status s = 
GetFetchNode(graph, item, &sink_node); EXPECT_FALSE(s.ok()); } diff --git a/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc index 2a334861c83..9e6b7f2bdef 100644 --- a/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc @@ -35,6 +35,31 @@ namespace { using ConfigMap = std::map<string, tensorflow::RewriterConfig_CustomGraphOptimizer>; +// tf.data optimizations, in the order we want to perform them. +constexpr std::array<const char*, 14> kTFDataOptimizations = { + "noop_elimination", + "shuffle_and_repeat_fusion", + "map_fusion", + "filter_fusion", + "filter_with_random_uniform_fusion", + "map_and_filter_fusion", + "hoist_random_uniform", + "map_parallelization", + "map_and_batch_fusion", + "map_vectorization", + "latency_all_edges", + "make_sloppy", + "parallel_batch", + "slack"}; + +// Standard grappler optimizations, in the order we want to perform them. +constexpr std::array<const char*, 5> kGrapplerOptimizations = { + "pruning", + "function", + "shape", + "arithmetic", + "dependency"}; + // Parses a list of string optimizer configurations into a map from // optimizer name -> rewriter config for that optimizer. Status ToConfigMap( @@ -80,13 +105,12 @@ Status TFDataMetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, GrapplerItem optimized_item = item; // Perform optimizations in a meaningful order. - for (const auto& optimization : - {"noop_elimination", "shuffle_and_repeat_fusion", "map_fusion", - "filter_fusion", "filter_with_random_uniform_fusion", - "map_and_filter_fusion", "hoist_random_uniform", "map_parallelization", - "map_and_batch_fusion", "map_vectorization", "latency_all_edges", - "make_sloppy", "parallel_batch", "pruning", "function", "shape", - "arithmetic", "dependency"}) { + for (const auto& optimization : kTFDataOptimizations) { + TF_RETURN_IF_ERROR( + ApplyOptimization(optimization, cluster, &optimized_item)); + } + + for (const auto& optimization : kGrapplerOptimizations) { TF_RETURN_IF_ERROR( ApplyOptimization(optimization, cluster, &optimized_item)); } @@ -108,10 +132,18 @@ Status TFDataMetaOptimizer::ApplyOptimization(const string& name, GraphDef result; (*optimizer)->set_deadline_usec(this->deadline_usec()); - TF_RETURN_IF_ERROR((*optimizer)->Optimize(cluster, *item, &result)); - item->graph.Swap(&result); + Status status = (*optimizer)->Optimize(cluster, *item, &result); + if (status.ok()) { + // The optimizer succeeded and wrote the optimized graph to result. + item->graph.Swap(&result); + } else if (errors::IsAborted(status)) { + // A status of errors::Aborted just means that the optimizer was a no-op and + // did not populate result. Swallow the error status and leave the original + // graph in item. 
+ status = Status::OK(); + } - return Status::OK(); + return status; } Status TFDataMetaOptimizer::Init( diff --git a/tensorflow/core/grappler/optimizers/data/rebatch.cc b/tensorflow/core/grappler/optimizers/data/rebatch.cc index f1a0a345a9f..6e690375d7b 100644 --- a/tensorflow/core/grappler/optimizers/data/rebatch.cc +++ b/tensorflow/core/grappler/optimizers/data/rebatch.cc @@ -78,22 +78,27 @@ constexpr std::array<const char*, 17> kPassThroughOps = { "WindowDataset" }; -constexpr std::array<const char*, 3> kFuncDatasetOps = { +constexpr std::array<const char*, 4> kFuncDatasetOps = { + "ExperimentalGroupByWindowDataset", "FlatMapDataset", "InterleaveDataset", - "ParallelInterleaveDatasetV2" + "ParallelInterleaveDatasetV2", }; +const std::map<string, const char*>* kFuncDatasetOpFuncs = + new std::map<string, const char*>({ + {"ExperimentalGroupByWindowDataset", "reduce_func"}, + {"FlatMapDataset", "f"}, + {"InterleaveDataset", "f"}, + {"ParallelInterleaveDatasetV2", "f"}, + }); + constexpr std::array<const char*, 9> kSourceDatasetOps = { - "FixedLengthRecordDataset", - "FixedLengthRecordDatasetV2", - "GeneratorDataset", - "RangeDataset", - "SparseTensorsSliceDataset", - "TensorDataset", - "TensorSliceDataset", - "TextLineDataset", - "TFRecordDataset" + "FixedLengthRecordDataset", "FixedLengthRecordDatasetV2", + "GeneratorDataset", "RangeDataset", + "SparseTensorsSliceDataset", "TensorDataset", + "TensorSliceDataset", "TextLineDataset", + "TFRecordDataset", }; NodeDef* AddCastNode(const string& input, DataType src_t, DataType dst_t, @@ -225,7 +230,8 @@ Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers, RecursivelyHandleOp(*input_node, num_workers, flib, graph)); TF_RETURN_IF_ERROR(UpdateOutputShapes(node.name(), num_workers, graph)); } else if (IsDatasetNodeOfType(node, kFuncDatasetOps)) { - const string func_name = node.attr().at("f").func().name(); + const string func_name = + node.attr().at(kFuncDatasetOpFuncs->at(node.op())).func().name(); const FunctionDef* fdef = flib->Find(func_name); GrapplerFunctionItem f_item; TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem( @@ -280,10 +286,10 @@ Status OptimizeGraph(const GrapplerItem& item, int64 num_workers, FunctionLibraryDefinition flib(OpRegistry::Global(), item.graph.library()); - NodeDef sink_node; - TF_RETURN_IF_ERROR(graph_utils::FindSinkNode(item.graph, &sink_node)); + NodeDef* sink_node; + TF_RETURN_IF_ERROR(graph_utils::GetFetchNode(graph, item, &sink_node)); TF_RETURN_IF_ERROR( - RecursivelyHandleOp(sink_node, num_workers, &flib, &graph)); + RecursivelyHandleOp(*sink_node, num_workers, &flib, &graph)); *output->mutable_library() = flib.ToProto(); return Status::OK(); } diff --git a/tensorflow/core/grappler/optimizers/data/slack.cc b/tensorflow/core/grappler/optimizers/data/slack.cc new file mode 100644 index 00000000000..8096435f629 --- /dev/null +++ b/tensorflow/core/grappler/optimizers/data/slack.cc @@ -0,0 +1,100 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/grappler/optimizers/data/slack.h" + +#include "absl/strings/str_join.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/grappler/clusters/cluster.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/mutable_graph_view.h" +#include "tensorflow/core/grappler/op_types.h" +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" +#include "tensorflow/core/grappler/optimizers/data/graph_utils.h" +#include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/platform/protobuf.h" + +namespace tensorflow { +namespace grappler { + +namespace { + +constexpr char kRetValOp[] = "_Retval"; + +} // namespace + +Status Slack::OptimizeAndCollectStats(Cluster* cluster, + const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) { + if (slack_period_ < 1) + return errors::InvalidArgument("Invalid `slack_period` parameter: ", + slack_period_); + + *output = item.graph; + MutableGraphView graph(output); + for (const auto& fetch_name : item.fetch) { + // If the GrapplerItem is derived from a FunctionDef, we don't optimize it, + // because we only want to add slack to the prefetch on the main dataset + // pipeline. + auto fetch = graph.GetNode(fetch_name); + if (fetch == nullptr || fetch->op() == kRetValOp) { + // Heuristic: If the fetch nodes are Retval ops, this item is from a + // function. + return Status::OK(); + } + } + if (item.fetch.size() != 1) { + return errors::InvalidArgument( + "Expected only one fetch node but there were ", item.fetch.size(), ": ", + absl::StrJoin(item.fetch, ", ")); + } + // Walk the input pipeline backwards from the fetch node to find the last + // PrefetchDataset node in the pipeline. + // TODO(rachelim): This doesn't do the right thing when the "final" prefetch + // is nested under an interleave or flat_map. Make this work, similar to + // `auto_shard.cc` and `rebatch.cc`. + NodeDef* dataset_node = graph.GetNode(item.fetch.at(0)); + while (true) { + if (dataset_node->op() == "PrefetchDataset") { + if (HasNodeAttr(*dataset_node, "slack_period")) { + (*dataset_node->mutable_attr())["slack_period"].set_i(slack_period_); + } else { + AddNodeAttr("slack_period", slack_period_, dataset_node); + } + return Status::OK(); + } + if (dataset_node->op() == "Identity" || + (absl::EndsWith(dataset_node->op(), "Dataset") && + dataset_node->input_size() > 0)) { + dataset_node = graph_utils::GetInputNode(*dataset_node, graph); + } else { + break; + } + } + return Status::OK(); +} + +void Slack::Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimize_output, double result) { + // no-op +} + +REGISTER_GRAPH_OPTIMIZER_AS(Slack, "slack"); + +} // namespace grappler +} // namespace tensorflow diff --git a/tensorflow/core/grappler/optimizers/data/slack.h b/tensorflow/core/grappler/optimizers/data/slack.h new file mode 100644 index 00000000000..fcdc2e9f48c --- /dev/null +++ b/tensorflow/core/grappler/optimizers/data/slack.h @@ -0,0 +1,62 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SLACK_H_ +#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SLACK_H_ + +#include "absl/strings/numbers.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h" + +namespace tensorflow { +namespace grappler { + +// This optimization sets the slack attr of the terminal PrefetchDataset node in +// an input pipeline. +class Slack : public TFDataOptimizerBase { + public: + Slack() = default; + ~Slack() override = default; + + string name() const override { return "slack"; }; + + Status Init( + const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override { + if (!config) return errors::InvalidArgument("Config parameter required."); + + const string& slack_period_param = + config->parameter_map().at("slack_period").s(); + if (!absl::SimpleAtoi(slack_period_param, &slack_period_)) { + return errors::InvalidArgument("Invalid `slack_period` parameter: ", + slack_period_param); + } + return Status::OK(); + } + + Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item, + GraphDef* output, + OptimizationStats* stats) override; + + void Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimize_output, double result) override; + + private: + int64 slack_period_ = -1; +}; + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SLACK_H_ diff --git a/tensorflow/core/grappler/optimizers/data/slack_test.cc b/tensorflow/core/grappler/optimizers/data/slack_test.cc new file mode 100644 index 00000000000..6a6a2c4ee35 --- /dev/null +++ b/tensorflow/core/grappler/optimizers/data/slack_test.cc @@ -0,0 +1,176 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/grappler/optimizers/data/slack.h" + +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/optimizers/data/function_utils.h" +#include "tensorflow/core/grappler/optimizers/data/graph_utils.h" +#include "tensorflow/core/grappler/utils/functions.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace grappler { +namespace { + +void SetupGrapplerItem(GrapplerItem *item) { + MutableGraphView graph(&item->graph); + + std::vector<std::pair<string, AttrValue>> common_attrs(2); + AttrValue shapes_attr; + SetAttrValue(std::vector<TensorShape>({{}}), &shapes_attr); + common_attrs[0] = std::make_pair("output_shapes", shapes_attr); + AttrValue types_attr; + SetAttrValue(std::vector<DataType>({DT_INT64}), &types_attr); + common_attrs[1] = std::make_pair("output_types", types_attr); + + NodeDef *start_node = graph_utils::AddScalarConstNode<int64>(0, &graph); + NodeDef *stop_node = graph_utils::AddScalarConstNode<int64>(10, &graph); + NodeDef *step_node = graph_utils::AddScalarConstNode<int64>(1, &graph); + + std::vector<string> range_inputs(3); + range_inputs[0] = start_node->name(); + range_inputs[1] = stop_node->name(); + range_inputs[2] = step_node->name(); + NodeDef *range_node = graph_utils::AddNode( + "RangeDataset", "RangeDataset", range_inputs, common_attrs, &graph); + + NodeDef *buffer_size_node = graph_utils::AddScalarConstNode<int64>(1, &graph); + NodeDef *prefetch_node = graph_utils::AddNode( + "PrefetchDataset", "PrefetchDataset", + {range_node->name(), buffer_size_node->name()}, common_attrs, &graph); + item->fetch.push_back(prefetch_node->name()); +} + +struct ParameterizedSlackTest + : ::testing::TestWithParam<std::tuple<string, int>> {}; + +TEST_P(ParameterizedSlackTest, BasicTest) { + GrapplerItem item; + SetupGrapplerItem(&item); + + Slack optimizer; + tensorflow::RewriterConfig_CustomGraphOptimizer config; + (*config.mutable_parameter_map())["slack_period"].set_s( + std::get<0>(GetParam())); + TF_ASSERT_OK(optimizer.Init(&config)); + + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + ASSERT_TRUE(graph_utils::ContainsNodeWithOp("PrefetchDataset", output)); + NodeDef optimized_prefetch_node = + output.node(graph_utils::FindGraphNodeWithOp("PrefetchDataset", output)); + EXPECT_EQ(optimized_prefetch_node.attr().at("slack_period").i(), + std::get<1>(GetParam())); +} + +INSTANTIATE_TEST_SUITE_P(DifferentSlackEveryValues, ParameterizedSlackTest, + ::testing::Values(std::make_tuple("1", 1), + std::make_tuple("8", 8))); + +TEST(SlackTest, TestFailWithoutInit) { + GrapplerItem item; + Slack optimizer; + GraphDef output; + Status result = optimizer.Optimize(nullptr, item, &output); + + EXPECT_FALSE(result.ok()); + EXPECT_TRUE(errors::IsInvalidArgument(result)); +} + +TEST(SlackTest, TestFailWithInvalidSlackEveryParam) { + GrapplerItem item; + SetupGrapplerItem(&item); + + Slack optimizer; + tensorflow::RewriterConfig_CustomGraphOptimizer config; + (*config.mutable_parameter_map())["slack_period"].set_s("0"); + TF_ASSERT_OK(optimizer.Init(&config)); + + GraphDef output; + Status result = optimizer.Optimize(nullptr, item, &output); + + EXPECT_FALSE(result.ok()); + EXPECT_TRUE(errors::IsInvalidArgument(result)); +} + +TEST(SlackTest, 
TestFunctionNotOptimized) { + GrapplerFunctionItem item; + FunctionDefLibrary lib_def; + FunctionDef *fdef = lib_def.add_function(); + fdef->mutable_signature()->set_name("nested_function"); + auto *input_arg = fdef->mutable_signature()->add_input_arg(); + input_arg->set_name("args_0"); + input_arg->set_type(DT_INT64); + auto *output_arg = fdef->mutable_signature()->add_output_arg(); + output_arg->set_name("identity"); + output_arg->set_type(DT_VARIANT); + fdef->mutable_signature()->set_is_stateful(true); + + AttrValue shapes_attr; + SetAttrValue(std::vector<TensorShape>({{}}), &shapes_attr); + AttrValue types_attr; + SetAttrValue(std::vector<DataType>({DT_INT64}), &types_attr); + NodeDef *tensor_dataset_node = + function_utils::AddNode("TensorDataset", "TensorDataset", {"args_0"}, + {std::make_pair("output_shapes", shapes_attr), + std::make_pair("Toutput_types", types_attr)}, + fdef); + NodeDef *prefetch_node = function_utils::AddNode( + "PrefetchDataset", "PrefetchDataset", + {strings::StrCat(tensor_dataset_node->name(), ":handle:0"), "args_0"}, + {std::make_pair("output_shapes", shapes_attr), + std::make_pair("output_types", types_attr)}, + fdef); + + AttrValue variant_type_attr; + SetAttrValue(DT_VARIANT, &variant_type_attr); + NodeDef *identity_node = function_utils::AddNode( + "Identity", "Identity", + {strings::StrCat(prefetch_node->name(), ":handle:0"), + strings::StrCat("^", tensor_dataset_node->name())}, + {std::make_pair("T", variant_type_attr)}, fdef); + + (*fdef->mutable_ret())["identity"] = + strings::StrCat(identity_node->name(), ":output:0"); + (*fdef->mutable_control_ret())[tensor_dataset_node->name()] = + tensor_dataset_node->name(); + fdef->mutable_signature()->add_control_output(tensor_dataset_node->name()); + + FunctionLibraryDefinition flib(OpRegistry::Global(), lib_def); + + TF_ASSERT_OK( + MakeGrapplerFunctionItem(*fdef, flib, /*graph_def_version=*/27, &item)); + + GraphDef output; + Slack optimizer; + tensorflow::RewriterConfig_CustomGraphOptimizer config; + (*config.mutable_parameter_map())["slack_period"].set_s("8"); + TF_ASSERT_OK(optimizer.Init(&config)); + + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + ASSERT_TRUE(graph_utils::ContainsNodeWithOp("PrefetchDataset", output)); + NodeDef optimized_prefetch_node = + output.node(graph_utils::FindGraphNodeWithOp("PrefetchDataset", output)); + // Should not set slack for function items. 
+ EXPECT_EQ(optimized_prefetch_node.attr().at("slack_period").i(), 0); +} + +} // namespace +} // namespace grappler +} // namespace tensorflow diff --git a/tensorflow/core/grappler/optimizers/debug_stripper.cc b/tensorflow/core/grappler/optimizers/debug_stripper.cc index 800160e6492..250c63a23fb 100644 --- a/tensorflow/core/grappler/optimizers/debug_stripper.cc +++ b/tensorflow/core/grappler/optimizers/debug_stripper.cc @@ -28,6 +28,17 @@ namespace grappler { Status DebugStripper::Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* output) { + bool can_optimize = false; + for (const NodeDef& node : item.graph.node()) { + if (IsAssert(node) || IsCheckNumerics(node) || IsPrint(node)) { + can_optimize = true; + break; + } + } + if (!can_optimize) { + return errors::Aborted("Nothing to do."); + } + *output = item.graph; for (NodeDef& node : *output->mutable_node()) { if (IsAssert(node)) { diff --git a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc index affd2d51c29..bdb11d5d6ad 100644 --- a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc +++ b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc @@ -39,8 +39,8 @@ TEST_F(DebugStripperTest, OutputEqualToInput) { DebugStripper optimizer; GraphDef output; - TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output)); - CompareGraphs(item.graph, output); + EXPECT_EQ(optimizer.Optimize(nullptr, item, &output), + errors::Aborted("Nothing to do.")); } TEST_F(DebugStripperTest, StripAssertOnTwoOutputs) { diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc index 1788fb97913..3541613a1a7 100644 --- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc @@ -295,6 +295,7 @@ void DependencyOptimizer::OptimizeNode(int node_idx, } node->set_op("NoOp"); node->clear_attr(); + DedupControlInputs(node); nodes_to_simplify->PushBack(node_to_idx_[node]); return; } diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc index 630fcdec954..1c2908ee9d5 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc @@ -1402,8 +1402,7 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item, GraphDef* optimized_graph) { // Nothing to do here. if (item.graph.library().function_size() == 0) { - *optimized_graph = item.graph; - return Status::OK(); + return errors::Aborted("Nothing to do."); } TF_RETURN_IF_ERROR(RunFunctionOptimizerPass(item, optimized_graph)); diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer.h b/tensorflow/core/grappler/optimizers/graph_optimizer.h index 44dfe0de789..2d0b2550396 100644 --- a/tensorflow/core/grappler/optimizers/graph_optimizer.h +++ b/tensorflow/core/grappler/optimizers/graph_optimizer.h @@ -38,8 +38,13 @@ class GraphOptimizer { // Routine called to allow an algorithm to propose a rewritten graph // for the graph, feeds and fetches in "item" to run more efficiently - // on "cluster". + // on "cluster". If the returned status is Status::OK() then + // *optimized_graph contains the rewritten graph. // Returns an error status if it failed to generate a solution. + // + // A return value of error::Aborted() can be used signal early termination of + // the optimizer, e.g. if the optimization turned out to be a no-op. 
In this + // case the content of *optimized_graph is undefined. virtual Status Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* optimized_graph) = 0; diff --git a/tensorflow/core/grappler/optimizers/implementation_selector.cc b/tensorflow/core/grappler/optimizers/implementation_selector.cc index 7dff0b5745f..d79a9dac939 100644 --- a/tensorflow/core/grappler/optimizers/implementation_selector.cc +++ b/tensorflow/core/grappler/optimizers/implementation_selector.cc @@ -187,8 +187,17 @@ Status ImplementationSelector::SelectImplementation(GraphDef* graph) const { Status ImplementationSelector::Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* optimized_graph) { + auto status = LoadFunctions(item.graph); + // Eat up the error from function loading, since this optimizer might run + // several times, and might try to run against functions generated by + // function_optimizer from previous runs, which will fail due to function + // signature mismatch. + if (!status.ok()) { + LOG(WARNING) << "Skipping optimization due to error while loading function " + << "libraries: " << status; + return errors::Aborted("Skipped Optimization"); + } *optimized_graph = item.graph; - TF_RETURN_IF_ERROR(LoadFunctions(*optimized_graph)); return SelectImplementation(optimized_graph); } diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc index cf1e42dfa44..8369db7d463 100644 --- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc @@ -2198,35 +2198,26 @@ Status LayoutOptimizer::Tune(const GrapplerItem& item, Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* output) { if (cluster == nullptr) { - return errors::InvalidArgument("cluster == nullptr"); + LOG(WARNING) << "layout optimizer was called with cluster == nullptr"; + return errors::Aborted("cluster == nullptr."); + } + if (GetNumGPUs(*cluster) < 1) { + return errors::Aborted( + "No GPUs found: LayoutOptimizer is currently only tuned for GPU."); } - if (GetNumGPUs(*cluster) < 1) { - // LayoutOptimizer is currently only tuned for GPU. - *output = item.graph; - return Status::OK(); - } + GraphProperties graph_properties(item); + TF_RETURN_IF_ERROR(graph_properties.InferStatically(false)); + GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED(); virtual_placer_.reset(new VirtualPlacer(cluster->GetDevices())); nodes_to_preserve_ = item.NodesToPreserve(); - GraphProperties graph_properties(item); - auto status = graph_properties.InferStatically(false); - if (!status.ok()) { - VLOG(1) << "Infer shape return status: " << status.ToString(); - *output = item.graph; - return status; - } - GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED(); TuningConfig config; config.no_gemm = true; // TODO(yaozhang): Enable tuning with various TuningConfig choices with // the measurement-based estimator. 
- status = Tune(item, graph_properties, config, output); - if (!status.ok()) { - *output = item.graph; - } - return status; + return Tune(item, graph_properties, config, output); } void LayoutOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item, diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc index c9ca9e211b9..94209aabea2 100644 --- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc @@ -709,6 +709,11 @@ LoopOptimizer::LoopOptimizer(RewriterConfig::Toggle opt_level, Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* optimized_graph) { + if (!options_.enable_loop_invariant_node_motion && + !options_.enable_stack_push_removal && + !options_.enable_dead_branch_removal) { + return errors::Aborted("Nothing to do."); + } *optimized_graph = item.graph; // Set up helper data structures. if (options_.enable_loop_invariant_node_motion) { @@ -900,9 +905,8 @@ Status LoopOptimizer::RemoveDeadBranches( return Status::OK(); } -void LoopOptimizer::Feedback(Cluster* /*cluster*/, const GrapplerItem& /*item*/, - const GraphDef& /*optimized_graph*/, - double /*result*/) { +void LoopOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimize_output, double result) { // Nothing to do for LoopOptimizer. } diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index e07fe783c23..0148fdbe775 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -41,6 +41,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/remapper.h" #include "tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.h" #include "tensorflow/core/grappler/optimizers/shape_optimizer.h" +#include "tensorflow/core/grappler/utils/canonicalizer.h" #include "tensorflow/core/grappler/utils/colocation.h" #include "tensorflow/core/grappler/utils/functions.h" #include "tensorflow/core/grappler/utils/topological_sort.h" @@ -98,18 +99,6 @@ uint64 DeadlineMicroSeconds(const RewriterConfig& cfg) { } } -Status CompressConstants(GraphDef* graph) { - for (int i = 0; i < graph->node_size(); ++i) { - NodeDef* node = graph->mutable_node(i); - if ((IsConstant(*node) || IsHostConstant(*node)) && - HasNodeAttr(*node, "value")) { - AttrValue& attr_val = (*node->mutable_attr())["value"]; - tensor::CompressTensorProtoInPlace(attr_val.mutable_tensor()); - } - } - return Status::OK(); -} - // A helper function to decide whether to enable the automatic mixed precision // optimizer. bool AutoMixedPrecisionEnabled(RewriterConfig::Toggle opt_level) { @@ -373,6 +362,12 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item, GraphOptimizer* fusion_optimizer = nullptr; GraphOptimizer* sa_optimizer = nullptr; + // Constants in the graph are normally compressed after model_pruner. + // Do it here if model pruner is disabled. + if (cfg_.disable_model_pruning()) { + CompressConstants(optimized_graph); + } + for (int iteration = 0; iteration < NumIterations(cfg_); ++iteration) { // Don't bother optimizing further if the graph is already tiny. 
if (optimized_graph->node_size() < min_graph_nodes) { @@ -389,6 +384,7 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item, reinterpret_cast<uintptr_t>(optimized_graph)), *optimized_graph); } + for (const auto& optimizer : optimizers) { GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED(); // Some optimizers can run only once. @@ -406,6 +402,10 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item, TF_RETURN_IF_ERROR(RunOptimizer(optimizer.get(), cluster, &optimized_item, optimized_graph, &optimization_result)); + if (iteration == 0 && optimizer->name() == "model_pruner") { + CompressConstants(optimized_graph); + } + if (VLOG_IS_ON(4)) { DumpGraphDefToFile( strings::StrCat("after_MetaOptimizer_iteration_", iteration, "_", @@ -439,17 +439,16 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item, if (fusion_optimizer != nullptr) { TF_RETURN_IF_ERROR(RunOptimizer(fusion_optimizer, cluster, &optimized_item, optimized_graph, &optimization_result)); + GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED(); } // ScopedAllocatorOptimizer must run last. if (sa_optimizer != nullptr) { TF_RETURN_IF_ERROR(RunOptimizer(sa_optimizer, cluster, &optimized_item, optimized_graph, &optimization_result)); + GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED(); } - // Compress the constants in the final graph. - TF_RETURN_IF_ERROR(CompressConstants(optimized_graph)); - bool is_optimized = std::find_if(optimization_result.results.begin(), optimization_result.results.end(), [](const OptimizerResult& result) { @@ -487,7 +486,14 @@ Status MetaOptimizer::RunOptimizer( string message; if (!status.ok()) { optimized_graph->Swap(&optimized_item->graph); - if (errors::IsDeadlineExceeded(status)) { + if (errors::IsAborted(status)) { + // By convention we (ab-)use the Aborted error code to signal that the + // optimizer returned without performing any changes to the graph. + message = strings::StrCat(optimizer->name(), + " did nothing. time = ", duration_ms, "ms."); + // Swallow the non-critical error. + status = Status::OK(); + } else if (errors::IsDeadlineExceeded(status)) { message = strings::StrCat(status.ToString(), ", time = ", duration_ms, "ms."); LOG(WARNING) << optimizer->name() << " failed: " << message; @@ -749,11 +755,7 @@ Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg, MetaOptimizer optimizer(cpu_device, cfg); optimizer.set_deadline_usec( DeadlineMicroSeconds(cfg.graph_options().rewrite_options())); - Status status = optimizer.Optimize(cluster, item, optimized_graph); - if (!status.ok()) { - *optimized_graph = item.graph; - } - return status; + return optimizer.Optimize(cluster, item, optimized_graph); } Status OptimizeGraph( @@ -792,9 +794,7 @@ Status OptimizeGraph( } tensorflow::GraphDef out_graph; - tensorflow::grappler::VirtualCluster cluster(&device_set); - // TODO(nareshmodi): Consider adding and using the more generic GraphOptions // proto (which also contain the OptimizerOptions). 
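Note: the Aborted handling above establishes a convention that several optimizers in this change rely on (debug_stripper, function_optimizer, loop_optimizer, shape_optimizer, model_pruner, implementation_selector): when a pass has nothing to rewrite it returns errors::Aborted without populating the output graph, and RunOptimizer swallows that status. A minimal sketch of the pattern from the optimizer's side; MaybeStripAsserts is a hypothetical pass body used only to illustrate the convention.

#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/grappler/op_types.h"
#include "tensorflow/core/lib/core/errors.h"

namespace tensorflow {
namespace grappler {

// Hypothetical pass body: only copies and rewrites the graph when it contains
// at least one op it knows how to handle.
Status MaybeStripAsserts(const GrapplerItem& item, GraphDef* optimized_graph) {
  bool can_optimize = false;
  for (const NodeDef& node : item.graph.node()) {
    if (IsAssert(node)) {
      can_optimize = true;
      break;
    }
  }
  if (!can_optimize) {
    // Signals that *optimized_graph was never populated; the meta optimizer
    // keeps the previous graph and treats this as a benign no-op.
    return errors::Aborted("Nothing to do.");
  }
  *optimized_graph = item.graph;
  // ... rewrite *optimized_graph in place ...
  return Status::OK();
}

}  // namespace grappler
}  // namespace tensorflow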
TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer( diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc index da30c7bbd9b..0d7de583972 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc @@ -135,7 +135,7 @@ class MetaOptimizerTest : public GrapplerTest {}; TEST_F(MetaOptimizerTest, RunsCustomOptimizer) { TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); GrapplerItem item; - CHECK(fake_input.NextItem(&item)); + ASSERT_TRUE(fake_input.NextItem(&item)); TestOptimizer::SetOptimized(false); ConfigProto config_proto; @@ -154,7 +154,7 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizer) { TEST_F(MetaOptimizerTest, RunsCustomOptimizerWithParams) { TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); GrapplerItem item; - CHECK(fake_input.NextItem(&item)); + ASSERT_TRUE(fake_input.NextItem(&item)); TestOptimizer::SetOptimized(false); ConfigProto config_proto; @@ -175,7 +175,7 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizerWithParams) { TEST_F(MetaOptimizerTest, RunsCustomOptimizerAndCustomGraphOptimizer) { TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); GrapplerItem item; - CHECK(fake_input.NextItem(&item)); + ASSERT_TRUE(fake_input.NextItem(&item)); TestOptimizer::SetOptimized(false); TestGraphOptimizer::SetOptimized(false); @@ -198,7 +198,7 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizerAndCustomGraphOptimizer) { TEST_F(MetaOptimizerTest, RunOptimizersTwice) { TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); GrapplerItem item; - CHECK(fake_input.NextItem(&item)); + ASSERT_TRUE(fake_input.NextItem(&item)); ConfigProto config_proto; auto& rewriter_config = @@ -215,7 +215,7 @@ TEST_F(MetaOptimizerTest, RunOptimizersTwice) { TEST_F(MetaOptimizerTest, RunToggleOptimizersAndCustomGraphOptimizerTwice) { TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); GrapplerItem item; - CHECK(fake_input.NextItem(&item)); + ASSERT_TRUE(fake_input.NextItem(&item)); ConfigProto config_proto; auto& rewriter_config = @@ -693,8 +693,9 @@ class SleepingOptimizer : public CustomGraphOptimizer { Status Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* optimized_graph) override { *optimized_graph = item.graph; - optimized_graph->add_node(); sleep(1); + GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED(); + optimized_graph->add_node(); return Status::OK(); } @@ -707,7 +708,29 @@ REGISTER_GRAPH_OPTIMIZER(SleepingOptimizer); TEST_F(MetaOptimizerTest, OptimizerTimesOut) { TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); GrapplerItem item; - CHECK(fake_input.NextItem(&item)); + ASSERT_TRUE(fake_input.NextItem(&item)); + + ConfigProto config; + RewriterConfig& rewriter_config = + *config.mutable_graph_options()->mutable_rewrite_options(); + rewriter_config.add_optimizers("SleepingOptimizer"); + rewriter_config.set_min_graph_nodes(-1); + rewriter_config.set_meta_optimizer_timeout_ms(500); + rewriter_config.set_meta_optimizer_iterations(RewriterConfig::ONE); + + GraphDef output; + const Status status = + RunMetaOptimizer(item, config, nullptr, nullptr, &output); + EXPECT_EQ(status.error_message(), "meta_optimizer exceeded deadline."); + // Make sure the graph was reverted to the original regardless of when the + // optimizer timed out. 
+ CompareGraphs(item.graph, output); +} + +TEST_F(MetaOptimizerTest, MetaOptimizerTimesOut) { + TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); + GrapplerItem item; + ASSERT_TRUE(fake_input.NextItem(&item)); ConfigProto config; RewriterConfig& rewriter_config = @@ -721,34 +744,34 @@ TEST_F(MetaOptimizerTest, OptimizerTimesOut) { const Status status = RunMetaOptimizer(item, config, nullptr, nullptr, &output); EXPECT_EQ(status.error_message(), "meta_optimizer exceeded deadline."); - // Make sure the graph was reverted to the original regardless of when the - // optimizer timed out. - CompareGraphs(item.graph, output); + // The meta optimizer should manage to finish one iteration. + EXPECT_EQ(item.graph.node_size() + 1, output.node_size()); } TEST_F(MetaOptimizerTest, OptimizerDoesNotTimeOut) { TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); GrapplerItem item; - CHECK(fake_input.NextItem(&item)); + ASSERT_TRUE(fake_input.NextItem(&item)); ConfigProto config; RewriterConfig& rewriter_config = *config.mutable_graph_options()->mutable_rewrite_options(); rewriter_config.add_optimizers("SleepingOptimizer"); rewriter_config.set_min_graph_nodes(-1); - rewriter_config.set_meta_optimizer_timeout_ms(1500); - rewriter_config.set_meta_optimizer_iterations(RewriterConfig::ONE); + rewriter_config.set_meta_optimizer_timeout_ms(2500); + rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO); GraphDef output; const Status status = RunMetaOptimizer(item, config, nullptr, nullptr, &output); TF_EXPECT_OK(status); - EXPECT_EQ(item.graph.node_size() + 1, output.node_size()); + // The meta optimizer should manage to finish two iterations. + EXPECT_EQ(item.graph.node_size() + 2, output.node_size()); } TEST_F(MetaOptimizerTest, RunPostOptimizationVerifiersOnValidGraph) { TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); GrapplerItem item; - CHECK(fake_input.NextItem(&item)); + ASSERT_TRUE(fake_input.NextItem(&item)); ConfigProto config_proto; auto& post_optimization_verifier_config = @@ -766,7 +789,7 @@ TEST_F(MetaOptimizerTest, RunPostOptimizationVerifiersOnValidGraph) { TEST_F(MetaOptimizerTest, RunInterOptimizerVerifiersOnValidGraph) { TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); GrapplerItem item; - CHECK(fake_input.NextItem(&item)); + ASSERT_TRUE(fake_input.NextItem(&item)); ConfigProto config_proto; auto& inter_optimizer_verifier_config = @@ -930,6 +953,59 @@ TEST_F(MetaOptimizerTest, RunInterOptimizerVerifiersOnInvalidGraph) { "NodeDef expected inputs 'float' do not match 3 inputs specified")); } +TEST_F(MetaOptimizerTest, CompressConstants) { + tensorflow::Scope scope = tensorflow::Scope::NewRootScope(); + Tensor zeros_t(DT_FLOAT, TensorShape({64})); + Tensor ones_t(DT_FLOAT, TensorShape({64})); + for (int i = 0; i < 64; ++i) { + zeros_t.flat<float>()(i) = 0.0f; + ones_t.flat<float>()(i) = 1.0f; + } + Output zeros = ops::Const(scope.WithOpName("zeros"), zeros_t); + Output host_ones = ops::Const(scope.WithOpName("host_ones"), ones_t); + GrapplerItem item; + TF_CHECK_OK(scope.ToGraphDef(&item.graph)); + ASSERT_EQ(item.graph.node(1).name(), "host_ones"); + // There is not C++ api for HostConst, so we manually change the node type + // here. 
+ item.graph.mutable_node(1)->set_op("HostConst"); + item.fetch = {"zeros", "host_ones"}; + auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {}); + + ConfigProto config_proto; + auto& rewriter_config = + *config_proto.mutable_graph_options()->mutable_rewrite_options(); + rewriter_config.set_min_graph_nodes(-1); + MetaOptimizer optimizer(/*cpu_device=*/nullptr, config_proto); + GraphDef output; + TF_EXPECT_OK(optimizer.Optimize(/*cluster=*/nullptr, item, &output)); + + { + ASSERT_EQ(output.node_size(), 2); + const NodeDef& node = output.node(0); + EXPECT_EQ(node.name(), "zeros"); + EXPECT_EQ(node.op(), "Const"); + const TensorProto& zeroes_t = node.attr().at("value").tensor(); + EXPECT_EQ(zeroes_t.float_val_size(), 1); + EXPECT_EQ(zeroes_t.float_val(0), 0.0f); + } + { + const NodeDef& node = output.node(1); + EXPECT_EQ(node.name(), "host_ones"); + EXPECT_EQ(node.op(), "HostConst"); + const TensorProto& ones_t = node.attr().at("value").tensor(); + EXPECT_EQ(ones_t.float_val_size(), 1); + EXPECT_EQ(ones_t.float_val(0), 1.0f); + } + + auto tensors = EvaluateNodes(output, item.fetch, {}); + ASSERT_EQ(tensors.size(), 2); + ASSERT_EQ(tensors_expected.size(), 2); + for (int i = 0; i < 2; ++i) { + test::ExpectTensorEqual<float>(tensors[i], tensors_expected[i]); + } +} + } // namespace } // namespace grappler } // namespace tensorflow diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc index d2094e27fa3..cbcf9e4d325 100644 --- a/tensorflow/core/grappler/optimizers/model_pruner.cc +++ b/tensorflow/core/grappler/optimizers/model_pruner.cc @@ -33,8 +33,7 @@ limitations under the License. namespace tensorflow { namespace grappler { -bool IsTrivialIdentity(const NodeDef& node, - const MutableGraphView& graph_view) { +bool IsTrivialIdentity(const NodeDef& node, const GraphView& graph_view) { for (const auto input : graph_view.GetFanins(node, /*include_controlling_nodes=*/true)) { if (input.port_id == Graph::kControlSlot) { @@ -56,7 +55,7 @@ bool IsTrivialIdentity(const NodeDef& node, return true; } -bool IsTrivialOp(const NodeDef& node, const MutableGraphView& graph_view) { +bool IsTrivialOp(const NodeDef& node, const GraphView& graph_view) { // Remove the stop gradient nodes since they serve no purpose once the graph // is built. Also remove Identity ops. if (IsStopGradient(node)) { @@ -78,7 +77,7 @@ bool IsTrivialOp(const NodeDef& node, const MutableGraphView& graph_view) { } bool RemovalIncreasesEdgeCount(const NodeDef& node, - const MutableGraphView& graph_view) { + const GraphView& graph_view) { int in_degree = graph_view.NumFanins(node, /*include_controlling_nodes=*/true); int out_degree = @@ -100,7 +99,7 @@ bool IsOutputPortRefValue(const NodeDef& node, int port_id, return false; } -bool CanRemoveNode(const NodeDef& node, const MutableGraphView& graph_view, +bool CanRemoveNode(const NodeDef& node, const GraphView& graph_view, const absl::flat_hash_set<string>& function_names, const OpRegistryInterface& op_registry) { if (IsNoOp(node) && node.input().empty()) { @@ -143,7 +142,7 @@ void ForwardInputsInternal( const absl::flat_hash_set<const NodeDef*>& nodes_to_delete, bool add_as_control, NodeDef* new_node, const absl::flat_hash_map<string, const NodeDef*>& optimized_nodes, - const MutableGraphView& graph_view) { + const GraphView& graph_view) { // To speed things up, use the optimized version of the node if // available. 
auto itr = optimized_nodes.find(node.name()); @@ -177,7 +176,7 @@ void ForwardInputs(const NodeDef& original_node, const absl::flat_hash_set<const NodeDef*>& nodes_to_delete, NodeDef* new_node, absl::flat_hash_map<string, const NodeDef*>* optimized_nodes, - const MutableGraphView& graph_view) { + const GraphView& graph_view) { // Forwards inputs of nodes to be deleted to their respective outputs. ForwardInputsInternal(original_node, nodes_to_delete, /*add_as_control=*/false, new_node, *optimized_nodes, @@ -436,33 +435,38 @@ Status SetTransitiveFaninGraph(const GraphDef& input_graph, } Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item, - GraphDef* pruned_graph) { + GraphDef* optimized_graph) { const std::unordered_set<string> nodes_to_preserve = item.NodesToPreserve(); // Prune all the nodes that won't be executed, ie all the nodes that aren't in // the fanin of a fetch node. If fetch nodes aren't specified, we'll assume // the whole graph might be executed. - GrapplerItem runnable_item; + std::unique_ptr<GraphDef> pruned_graph_release; + GraphDef* pruned_graph; if (!nodes_to_preserve.empty()) { + pruned_graph_release.reset(new GraphDef()); + pruned_graph = pruned_graph_release.get(); + pruned_graph->mutable_node()->Reserve(item.graph.node_size()); std::vector<string> terminal_nodes(nodes_to_preserve.begin(), nodes_to_preserve.end()); std::sort(terminal_nodes.begin(), terminal_nodes.end()); - TF_RETURN_IF_ERROR(SetTransitiveFaninGraph(item.graph, &runnable_item.graph, - terminal_nodes)); + TF_RETURN_IF_ERROR( + SetTransitiveFaninGraph(item.graph, pruned_graph, terminal_nodes)); bool did_split_identity_n = false; - TF_RETURN_IF_ERROR(SplitIdentityNInputs( - &runnable_item.graph, terminal_nodes, &did_split_identity_n)); + TF_RETURN_IF_ERROR(SplitIdentityNInputs(pruned_graph, terminal_nodes, + &did_split_identity_n)); if (did_split_identity_n) { GraphDef fanin_split_identity_n_graph; TF_RETURN_IF_ERROR(SetTransitiveFaninGraph( - runnable_item.graph, &fanin_split_identity_n_graph, terminal_nodes)); - runnable_item.graph.Swap(&fanin_split_identity_n_graph); + *pruned_graph, &fanin_split_identity_n_graph, terminal_nodes)); + pruned_graph->Swap(&fanin_split_identity_n_graph); } + GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED(); } else { - runnable_item = item; + pruned_graph = const_cast<GraphDef*>(&item.graph); } - MutableGraphView graph_view(&runnable_item.graph); + GraphView graph_view(pruned_graph); absl::flat_hash_set<string> function_names; for (const auto& function : item.graph.library().function()) { function_names.insert(function.signature().name()); @@ -471,7 +475,7 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item, // Check if we can further prune the graph, by removing the trivial ops. 
absl::flat_hash_set<const NodeDef*> nodes_to_delete; - for (auto& node : runnable_item.graph.node()) { + for (const auto& node : pruned_graph->node()) { if (!IsTrivialOp(node, graph_view)) { continue; } @@ -500,22 +504,25 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item, } } - pruned_graph->Clear(); - *pruned_graph->mutable_library() = item.graph.library(); - *pruned_graph->mutable_versions() = item.graph.versions(); + if (nodes_to_delete.empty() && nodes_to_preserve.empty()) { + return errors::Aborted("Nothing to do."); + } + optimized_graph->Clear(); + *optimized_graph->mutable_library() = item.graph.library(); + *optimized_graph->mutable_versions() = item.graph.versions(); if (nodes_to_delete.empty()) { - pruned_graph->mutable_node()->Swap(runnable_item.graph.mutable_node()); + optimized_graph->mutable_node()->Swap(pruned_graph->mutable_node()); return Status::OK(); } const bool fetches_are_known = !item.fetch.empty(); - pruned_graph->mutable_node()->Reserve(runnable_item.graph.node_size()); absl::flat_hash_map<string, const NodeDef*> optimized_nodes; - for (auto& node : runnable_item.graph.node()) { + optimized_graph->mutable_node()->Reserve(pruned_graph->node_size()); + for (const auto& node : pruned_graph->node()) { if (!fetches_are_known || nodes_to_delete.find(&node) == nodes_to_delete.end()) { - NodeDef* new_node = pruned_graph->add_node(); + NodeDef* new_node = optimized_graph->add_node(); *new_node = node; new_node->clear_input(); ForwardInputs(node, nodes_to_delete, new_node, &optimized_nodes, @@ -524,14 +531,15 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item, } VLOG(1) << "Pruned " << nodes_to_delete.size() << " nodes from the graph. The graph now contains " - << pruned_graph->node_size() << " nodes."; - CHECK_LE(pruned_graph->node_size(), item.graph.node_size()); - + << optimized_graph->node_size() << " nodes."; + if (optimized_graph->node_size() > item.graph.node_size()) { + return errors::Internal("Pruning increased graph size."); + } return Status::OK(); } void ModelPruner::Feedback(Cluster* cluster, const GrapplerItem& item, - const GraphDef& pruned_graph, double result) { + const GraphDef& optimized_graph, double result) { // Nothing to do for ModelPruner. 
} diff --git a/tensorflow/core/grappler/optimizers/model_pruner.h b/tensorflow/core/grappler/optimizers/model_pruner.h index 76cc792a454..b6fa5146fcc 100644 --- a/tensorflow/core/grappler/optimizers/model_pruner.h +++ b/tensorflow/core/grappler/optimizers/model_pruner.h @@ -32,10 +32,10 @@ class ModelPruner : public GraphOptimizer { string name() const override { return "model_pruner"; }; Status Optimize(Cluster* cluster, const GrapplerItem& item, - GraphDef* pruned_graph) override; + GraphDef* optimized_graph) override; void Feedback(Cluster* cluster, const GrapplerItem& item, - const GraphDef& pruned_graph, double result) override; + const GraphDef& optimized_graph, double result) override; }; } // end namespace grappler diff --git a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc index 15563aa807e..c51c5fcfaf5 100644 --- a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc @@ -733,7 +733,6 @@ ScopedAllocatorOptimizer::ScopedAllocatorOptimizer( Status ScopedAllocatorOptimizer::Optimize(Cluster* /*cluster*/, const GrapplerItem& item, GraphDef* optimized_graph) { - *optimized_graph = item.graph; // Nodes that cannot be removed from the graph without damaging correctness, // typically fetch nodes. nodes_to_preserve_ = item.NodesToPreserve(); @@ -742,6 +741,8 @@ Status ScopedAllocatorOptimizer::Optimize(Cluster* /*cluster*/, const bool assume_valid_feeds = opt_level_ == RewriterConfig::AGGRESSIVE; LOG_WARNING_AND_RETURN_IF_ERROR( graph_properties.InferStatically(assume_valid_feeds)); + + *optimized_graph = item.graph; node_map_.reset(new NodeMap(optimized_graph)); LOG_WARNING_AND_RETURN_IF_ERROR(ScopedAllocatorOptimizer::ProcessGraphDef( diff --git a/tensorflow/core/grappler/optimizers/shape_optimizer.cc b/tensorflow/core/grappler/optimizers/shape_optimizer.cc index 08237d7e074..cfcd1c0d08d 100644 --- a/tensorflow/core/grappler/optimizers/shape_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/shape_optimizer.cc @@ -28,13 +28,43 @@ limitations under the License. namespace tensorflow { namespace grappler { +// This optimizer first rewrites Prod(Shape(x)) into Size(x). It then uses +// symbolic shapes to simplify Div(Size(x), Size(y)) in the case that x and y +// share symbolic shapes that are unknown but known to be identical, e.g. we can +// deduce that Div(Size([2,?,2]) Size([1,?,2])) is 2 if the two unknown +// dimensions are known to be identical. This can be inferred if they share the +// same symbolic representation (negative integer dimension size). Status ShapeOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* optimized_graph) { - *optimized_graph = item.graph; + // Do a quick check to determine if we can skip this optimizer. 
+ bool can_optimize = false; + bool has_div = false; + bool has_size = false; + bool has_shape = false; + bool has_prod = false; + for (const NodeDef& node : item.graph.node()) { + if (IsShape(node)) { + has_shape = true; + } else if (IsProd(node)) { + has_prod = true; + } else if (IsDiv(node)) { + has_div = true; + } else if (IsSize(node)) { + has_size = true; + } + if ((has_shape && has_prod) || (has_div && has_size)) { + can_optimize = true; + break; + } + } + if (!can_optimize) { + return errors::Aborted("Nothing to do."); + } + *optimized_graph = item.graph; + MutableGraphView graph(optimized_graph); GraphProperties properties(item); bool inferred_properties = false; - MutableGraphView graph(optimized_graph); // The product of all the dimensions in a tensor shape can be expressed more // simply as the size of the tensor. diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc index 34bbf948c94..acbb81ac23f 100644 --- a/tensorflow/core/grappler/utils.cc +++ b/tensorflow/core/grappler/utils.cc @@ -493,13 +493,26 @@ Status CheckAttrsExist(const NodeDef& node, absl::Span<const string> keys) { return Status::OK(); } -Status IsKernelRegisteredForNode(const NodeDef& node) { +Status IsKernelRegisteredForNode( + absl::string_view node_name, bool has_experimental_debug_info, + const NodeDef_ExperimentalDebugInfo& experimental_debug_info, + absl::string_view node_op, absl::string_view node_device, + AttrSlice node_attrs) { DeviceNameUtils::ParsedName parsed_name; - if (!DeviceNameUtils::ParseFullName(node.device(), &parsed_name)) { + if (!DeviceNameUtils::ParseFullName(node_device, &parsed_name)) { return errors::InvalidArgument("Could not parse device name: ", - node.device()); + node_device); } - return FindKernelDef(DeviceType(parsed_name.type), node, nullptr, nullptr); + return FindKernelDef(DeviceType(parsed_name.type), node_name, + has_experimental_debug_info, experimental_debug_info, + node_op, node_device, node_attrs, nullptr, nullptr); +} + +Status IsKernelRegisteredForNode(const NodeDef& node) { + return IsKernelRegisteredForNode(node.name(), + node.has_experimental_debug_info(), + node.experimental_debug_info(), node.op(), + node.device(), AttrSlice(&node.attr())); } } // end namespace grappler diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h index 6d09376ab4d..700e4319810 100644 --- a/tensorflow/core/grappler/utils.h +++ b/tensorflow/core/grappler/utils.h @@ -298,6 +298,11 @@ void PermuteNodesInPlace(GraphDef* graph, std::vector<int>* permutation, // Returns Status::OK() if a kernel is registered for node.op() on the device // type corresponding to node.device(). 
+Status IsKernelRegisteredForNode( + absl::string_view node_name, bool has_experimental_debug_info, + const NodeDef_ExperimentalDebugInfo& experimental_debug_info, + absl::string_view node_op, absl::string_view node_device, + AttrSlice node_attrs); Status IsKernelRegisteredForNode(const NodeDef& node); Status SetTensorValue(DataType dtype, int value, Tensor* tensor); diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD index b24f27dde77..2c3ae8b9d96 100644 --- a/tensorflow/core/grappler/utils/BUILD +++ b/tensorflow/core/grappler/utils/BUILD @@ -277,3 +277,29 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +cc_library( + name = "canonicalizer", + srcs = ["canonicalizer.cc"], + hdrs = ["canonicalizer.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler:op_types", + "//tensorflow/core/grappler:utils", + ], +) + +tf_cc_test( + name = "canonicalizer_test", + size = "small", + srcs = ["canonicalizer_test.cc"], + deps = [ + ":canonicalizer", + "//tensorflow/core:all_kernels", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/core/grappler/utils/canonicalizer.cc b/tensorflow/core/grappler/utils/canonicalizer.cc new file mode 100644 index 00000000000..a30d97b0f3d --- /dev/null +++ b/tensorflow/core/grappler/utils/canonicalizer.cc @@ -0,0 +1,67 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/grappler/utils/canonicalizer.h" + +#include <algorithm> + +#include "tensorflow/core/framework/tensor_util.h" +#include "tensorflow/core/grappler/op_types.h" +#include "tensorflow/core/grappler/utils.h" + +namespace tensorflow { +namespace grappler { + +void CanonicalizeNode(NodeDef* node) { + if (node->input_size() < 2) return; + // Partition control and regular inputs. + int index = 0; + for (; index < node->input_size(); ++index) { + if (IsControlInput(node->input(index))) { + break; + } + } + auto* input = node->mutable_input(); + // Maybe sort regular inputs. + if (IsCommutative(*node) && index > 0) { + std::sort(input->begin(), input->begin() + index); + } + // Sort and dedup control inputs. 
+ if (index < node->input_size()) { + std::sort(input->begin() + index, input->end()); + input->erase(std::unique(input->begin() + index, input->end()), + input->end()); + } +} + +void CanonicalizeGraph(GraphDef* graph) { + for (int i = 0; i < graph->node_size(); ++i) { + CanonicalizeNode(graph->mutable_node(i)); + } +} + +void CompressConstants(GraphDef* graph) { + for (int i = 0; i < graph->node_size(); ++i) { + NodeDef* node = graph->mutable_node(i); + if ((IsConstant(*node) || IsHostConstant(*node)) && + HasNodeAttr(*node, "value")) { + AttrValue& attr_val = (*node->mutable_attr())["value"]; + tensor::CompressTensorProtoInPlace(attr_val.mutable_tensor()); + } + } +} + +} // namespace grappler +} // namespace tensorflow diff --git a/tensorflow/core/grappler/utils/canonicalizer.h b/tensorflow/core/grappler/utils/canonicalizer.h new file mode 100644 index 00000000000..a913fc25233 --- /dev/null +++ b/tensorflow/core/grappler/utils/canonicalizer.h @@ -0,0 +1,45 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_CANONICALIZER_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_CANONICALIZER_H_ + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace grappler { + +// Canonicalizes node by performing the following steps +// - sorting control inputs, +// - sorting data inputs if the node represents a commutative op. +void CanonicalizeNode(NodeDef* node); + +// Canonicalizes all nodes in graph. +void CanonicalizeGraph(GraphDef* graph); + +// Compresses Const and HostConstant nodes in the graph to the smallest +// representation possible, either +// a) truncated repeated field representation, or +// b) raw serialized byte format. +// Each node is only modified if it is larger than 64 bytes and compression +// reduces its size by more than 50%. +void CompressConstants(GraphDef* graph); + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_CANONICALIZER_H_ diff --git a/tensorflow/core/grappler/utils/canonicalizer_test.cc b/tensorflow/core/grappler/utils/canonicalizer_test.cc new file mode 100644 index 00000000000..2a1ba929068 --- /dev/null +++ b/tensorflow/core/grappler/utils/canonicalizer_test.cc @@ -0,0 +1,76 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/grappler/utils/canonicalizer.h" + +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace grappler { +namespace { + +NodeDef MakeNode(const string& op) { + NodeDef node; + node.set_name("node"); + node.set_op(op); + *node.add_input() = "b"; + *node.add_input() = "a"; + *node.add_input() = "^z"; + *node.add_input() = "^y"; + *node.add_input() = "^x"; + *node.add_input() = "^z"; + return node; +} + +void Verify(const NodeDef& node) { + EXPECT_EQ(node.name(), "node"); + ASSERT_EQ(node.input_size(), 5); + if (node.op() == "Div") { + EXPECT_EQ(node.input(0), "b"); + EXPECT_EQ(node.input(1), "a"); + } else { + EXPECT_EQ(node.input(0), "a"); + EXPECT_EQ(node.input(1), "b"); + } + EXPECT_EQ(node.input(2), "^x"); + EXPECT_EQ(node.input(3), "^y"); + EXPECT_EQ(node.input(4), "^z"); +} + +TEST(CanonicalizeNode, NonCommutative) { + NodeDef node = MakeNode("Div"); + CanonicalizeNode(&node); + Verify(node); +} + +TEST(CanonicalizeNode, Commutative) { + NodeDef node = MakeNode("Mul"); + CanonicalizeNode(&node); + Verify(node); +} + +TEST(CanonicalizeGraph, Simple) { + GraphDef graph; + *graph.add_node() = MakeNode("Div"); + *graph.add_node() = MakeNode("Mul"); + CanonicalizeGraph(&graph); + for (auto node : graph.node()) { + Verify(node); + } +} + +} // namespace +} // namespace grappler +} // namespace tensorflow diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 5e4439cc424..db690acb2f7 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -47,7 +47,10 @@ load( load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests") -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") +load( + "//tensorflow/core:platform/default/cuda_build_defs.bzl", + "if_cuda_is_configured", +) load( "//tensorflow/core:platform/default/build_config.bzl", "tf_kernel_tests_linkstatic", @@ -941,6 +944,7 @@ cc_library( ":edit_distance_op", ":extract_image_patches_op", ":extract_volume_patches_op", + ":fingerprint_op", ":gather_nd_op", ":gather_op", ":guarantee_const_op", @@ -1030,6 +1034,28 @@ tf_kernel_library( deps = ARRAY_DEPS, ) +tf_kernel_library( + name = "fingerprint_op", + prefix = "fingerprint_op", + deps = ARRAY_DEPS, +) + +tf_cc_test( + name = "fingerprint_op_test", + size = "small", + srcs = ["fingerprint_op_test.cc"], + kernels = [":fingerprint_op"], + deps = [ + ":ops_testutil", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_kernel_library( name = "gather_nd_op", prefix = "gather_nd_op", @@ -1583,10 +1609,10 @@ tf_cuda_cc_test( "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:tensorflow", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "//tensorflow/stream_executor/cuda:cudnn_plugin", ], ) @@ -1711,6 +1737,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "//tensorflow/stream_executor/cuda:cudnn_plugin", ], ) @@ -2561,6 +2588,7 @@ tf_kernel_library( "//tensorflow/core:core_cpu_internal", 
"//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler/clusters:virtual_cluster", @@ -5869,7 +5897,6 @@ filegroup( "initializable_lookup_table.h", "inplace_ops.cc", "inplace_ops_functor.h", - "logging_ops.h", "lookup_table_init_op.h", "lookup_table_op.h", "lookup_util.h", @@ -6150,6 +6177,7 @@ filegroup( ) ANDROID_TEXTUAL_HDRS = [ + "eigen_spatial_convolutions-inl.h", "gather_nd_op_cpu_impl.h", "gemm_functors.h", "mirror_pad_op_cpu_impl.h", diff --git a/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc index de25610196a..d1ac433035c 100644 --- a/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc +++ b/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc @@ -30,8 +30,8 @@ void AdjustHueGPU<T>::operator()(GPUDevice* device, const T* const input, const float* const delta, T* const output) { const auto stream = device->stream(); - const CudaLaunchConfig config = - GetCudaLaunchConfig(number_of_elements, *device); + const GpuLaunchConfig config = + GetGpuLaunchConfig(number_of_elements, *device); const int threads_per_block = config.thread_per_block; const int block_count = (number_of_elements + threads_per_block - 1) / threads_per_block; diff --git a/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc index dc032d9cbbd..f9c2806ee7b 100644 --- a/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc +++ b/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc @@ -31,8 +31,8 @@ void AdjustSaturationGPU<T>::operator()(GPUDevice* device, const float* const scale, T* const output) { const auto stream = device->stream(); - const CudaLaunchConfig config = - GetCudaLaunchConfig(number_of_elements, *device); + const GpuLaunchConfig config = + GetGpuLaunchConfig(number_of_elements, *device); const int threads_per_block = config.thread_per_block; const int block_count = (number_of_elements + threads_per_block - 1) / threads_per_block; diff --git a/tensorflow/core/kernels/avgpooling_op.cc b/tensorflow/core/kernels/avgpooling_op.cc index ba38e1a188f..1cc5a2d8a3e 100644 --- a/tensorflow/core/kernels/avgpooling_op.cc +++ b/tensorflow/core/kernels/avgpooling_op.cc @@ -36,10 +36,10 @@ limitations under the License. #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "tensorflow/core/kernels/maxpooling_op_gpu.h" #include "tensorflow/core/kernels/pooling_ops_common_gpu.h" -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM namespace tensorflow { @@ -112,7 +112,7 @@ REGISTER_KERNEL_BUILDER( Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"), AvgPoolingOp<CPUDevice, Eigen::half>); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template <typename T> class AvgPoolingOp<GPUDevice, T> : public UnaryOp<T> { public: @@ -205,7 +205,7 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<double>("T"), AvgPoolingOp<GPUDevice, double>); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM // The operation to compute AvgPool gradients. 
// It takes two inputs: @@ -368,7 +368,7 @@ TF_CALL_float(REGISTER_CPU_KERNEL); TF_CALL_double(REGISTER_CPU_KERNEL); TF_CALL_half(REGISTER_CPU_KERNEL); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // A CUDNN based AvgPoolingGrad implementation. It includes the padding as the // candidates for the pooling operation. @@ -577,6 +577,6 @@ REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") .HostMemory("orig_input_shape"), AvgPoolingGradOpCustomGPUKernel<Eigen::half>); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace tensorflow diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc index 89a98eb6f3c..23cde515b91 100644 --- a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc +++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc @@ -90,7 +90,7 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num, const int pad_l, T* const bottom_diff, const GPUDevice& d) { int x_size = num * height * width * channels; - CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d); + GpuLaunchConfig config = GetCudaLaunchConfig(x_size, d); TF_CHECK_OK(CudaLaunchKernel( AvePoolBackwardNHWC<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, top_diff, num, height, width, diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc index d0be9227078..9c7dddc2182 100644 --- a/tensorflow/core/kernels/batch_kernels.cc +++ b/tensorflow/core/kernels/batch_kernels.cc @@ -520,7 +520,6 @@ class BatchResource : public ResourceBase { return; } FunctionLibraryRuntime::Options opts; - opts.step_id = last_task_context->step_id(); opts.step_container = last_task_context->step_container(); opts.cancellation_manager = last_task_context->cancellation_manager(); opts.stats_collector = last_task_context->stats_collector(); diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc index 7bc43be66b3..2806e692d87 100644 --- a/tensorflow/core/kernels/batch_matmul_op_real.cc +++ b/tensorflow/core/kernels/batch_matmul_op_real.cc @@ -16,7 +16,7 @@ limitations under the License. 
#include "tensorflow/core/kernels/batch_matmul_op_impl.h" #if GOOGLE_CUDA -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda.h" #endif // GOOGLE_CUDA namespace tensorflow { diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc index d4cd45f9fd2..c00232f5148 100644 --- a/tensorflow/core/kernels/bias_op_gpu.cu.cc +++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc @@ -81,7 +81,7 @@ void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias, if (total_count == 0) { return; } - CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); + GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); if (data_format == FORMAT_NHWC) { TF_CHECK_OK(CudaLaunchKernel(BiasNHWCKernel<T>, config.block_count, config.thread_per_block, 0, d.stream(), @@ -203,7 +203,7 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop, return; } static constexpr int32 kWarpSize = 32; - CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); + GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); const int max_shared_memory_size = d.sharedMemPerBlock() / 2; int32 shared_memory_size = 0; diff --git a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc index 64016b887dc..1408df184f4 100644 --- a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc +++ b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc @@ -92,7 +92,7 @@ struct BucketizeFunctor<GPUDevice, T> { } TF_RETURN_IF_ERROR(boundaries_array.Finalize()); - CudaLaunchConfig config = GetCudaLaunchConfig(input.size(), d); + GpuLaunchConfig config = GetCudaLaunchConfig(input.size(), d); int32 shared_mem_size = sizeof(float) * boundaries_vector.size(); const int32 kMaxSharedMemBytes = 16384; if (shared_mem_size < d.sharedMemPerBlock() && diff --git a/tensorflow/core/kernels/collective_nccl_reducer_test.cc b/tensorflow/core/kernels/collective_nccl_reducer_test.cc index 7348019e056..00dfa722b57 100644 --- a/tensorflow/core/kernels/collective_nccl_reducer_test.cc +++ b/tensorflow/core/kernels/collective_nccl_reducer_test.cc @@ -95,7 +95,6 @@ class NcclReducerTest : public ::testing::Test { void Init(int num_ranks) { setenv("NCCL_DEBUG", "INFO", 1 /* replace */); setenv("NCCL_LAUNCH_MODE", "PARALLEL", 1 /* replace */); - setenv("TF_CPP_VMODULE", "nccl_manager=2", 1 /* replace */); InitGPUDevices(); std::vector<std::unique_ptr<Device>> local_devices; std::vector<string> device_names; diff --git a/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc b/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc index b417b927435..133f7ec38bc 100644 --- a/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc +++ b/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc @@ -121,7 +121,7 @@ __global__ void CompareAndBitpackKernel<double>(const int size, TTypes<uint8>::Matrix output) { \ const GPUDevice& d = c->eigen_device<GPUDevice>(); \ int64 total_count = output.size(); \ - CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); \ + GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); \ \ TF_CHECK_OK(CudaLaunchKernel(CompareAndBitpackKernel<T>, \ config.block_count, config.thread_per_block, \ diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h index b735f78c2e3..22b10ade4db 100644 --- a/tensorflow/core/kernels/conv_2d.h +++ b/tensorflow/core/kernels/conv_2d.h @@ -179,42 +179,50 @@ struct MatMulConvFunctor { // Shuffles a filter tensor from TensorFlow format HWIO to 
dst_filter_format. // -// Note: Currently OIHW is the only supported destination format. Support for -// OHWI format will be added in a follow-up change. +// Note: Currently supports OIHW and OHWI destination formats. template <typename Device, typename T, typename IndexType, int NDIMS> struct TransformFilter { void operator()(const Device& d, FilterTensorFormat dst_filter_format, typename TTypes<T, NDIMS, IndexType>::ConstTensor in, typename TTypes<T, NDIMS, IndexType>::Tensor out) { + // NOTE: Source filter format is always HWIO. + Eigen::DSizes<IndexType, NDIMS - 2> spatial_dims; + for (int i = 0; i < spatial_dims.rank(); ++i) { + spatial_dims[i] = in.dimension(i); + } + // Merge the spatial dimensions together to speed up the shuffle operation. Eigen::DSizes<IndexType, 3> merged_dims; - merged_dims[0] = in.dimension(0); // spatial dimensions - for (int i = 1; i < NDIMS - 2; ++i) { - merged_dims[0] *= in.dimension(i); - } - merged_dims[1] = in.dimension(NDIMS - 2); // input filters - merged_dims[2] = in.dimension(NDIMS - 1); // output filters - - DCHECK(dst_filter_format == FORMAT_OIHW) - << "Unsupported destination filter format: " - << ToString(dst_filter_format); - // Source filter format is FORMAT_HWIO and spatial dimensions HW are merged - // in the beginning. - Eigen::DSizes<IndexType, 3> shuffling_perm = - Eigen::DSizes<IndexType, 3>(2, 1, 0); + merged_dims[0] = spatial_dims.TotalSize(); // product of spatial dims [H*W] + merged_dims[1] = in.dimension(NDIMS - 2); // input filters [I] + merged_dims[2] = in.dimension(NDIMS - 1); // output filters [O] + // Shuffle tensor with merged spatial dimensions. + Eigen::DSizes<IndexType, 3> shuffling_perm; + // Expand shuffled tensor into final dimensions. Eigen::DSizes<IndexType, NDIMS> expanded_dims; - int out_index = 0; - for (int merged_dim = 0; merged_dim < merged_dims.rank(); ++merged_dim) { - if (shuffling_perm[merged_dim] == 0) { - for (int spatial_dim = 0; spatial_dim < NDIMS - 2; ++spatial_dim) { - expanded_dims[out_index++] = in.dimension(spatial_dim); - } - } else { - constexpr int kLastSpatialDim = NDIMS - 3; - expanded_dims[out_index++] = - in.dimension(kLastSpatialDim + shuffling_perm[merged_dim]); + + if (dst_filter_format == FORMAT_OIHW) { + shuffling_perm = Eigen::DSizes<IndexType, 3>(2, 1, 0); + + expanded_dims[0] = merged_dims[2]; // [O] + expanded_dims[1] = merged_dims[1]; // [I] + for (int i = 0; i < spatial_dims.rank(); ++i) { + expanded_dims[2 + i] = spatial_dims[i]; } + + } else if (dst_filter_format == FORMAT_OHWI) { + shuffling_perm = Eigen::DSizes<IndexType, 3>(2, 0, 1); + + expanded_dims[0] = merged_dims[2]; // [O] + expanded_dims[NDIMS - 1] = merged_dims[1]; // [I] + for (int i = 0; i < spatial_dims.rank(); ++i) { + expanded_dims[1 + i] = spatial_dims[i]; + } + + } else { + DCHECK(false) << "Unsupported destination filter format: " + << ToString(dst_filter_format); } out.device(d) = diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h index 820a92b0f09..8d6249d001e 100644 --- a/tensorflow/core/kernels/conv_2d_gpu.h +++ b/tensorflow/core/kernels/conv_2d_gpu.h @@ -25,7 +25,7 @@ limitations under the License. 
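Aside (illustrative only, standalone C++ with made-up names, not part of the patch): the HWIO-to-OHWI reindexing that the new TransformFilter path in conv_2d.h performs is equivalent to this loop nest.

#include <cstddef>
#include <vector>

// Reorders a filter stored as [H][W][I][O] (HWIO) into [O][H][W][I] (OHWI).
std::vector<float> HWIOToOHWI(const std::vector<float>& hwio,
                              int H, int W, int I, int O) {
  std::vector<float> ohwi(hwio.size());
  for (int h = 0; h < H; ++h)
    for (int w = 0; w < W; ++w)
      for (int i = 0; i < I; ++i)
        for (int o = 0; o < O; ++o) {
          const size_t src = ((static_cast<size_t>(h) * W + w) * I + i) * O + o;
          const size_t dst = ((static_cast<size_t>(o) * H + h) * W + w) * I + i;
          ohwi[dst] = hwio[src];
        }
  return ohwi;
}

The existing FORMAT_OIHW branch is the same idea with destination index ((o*I + i)*H + h)*W + w; the patch merely merges the spatial dimensions first so the shuffle runs over a 3D view.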
#include <limits> #include <utility> -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/conv_2d.h" #include "tensorflow/core/lib/math/math_util.h" @@ -432,15 +432,24 @@ struct TransformFilter<GPUDevice, T, int, NDIMS> { } combined_dims[1] = in.dimension(NDIMS - 2); // input filters combined_dims[2] = in.dimension(NDIMS - 1); // output filters - CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d); + GpuLaunchConfig config = GetCudaLaunchConfig(out.size(), d); - CHECK(dst_filter_format == FORMAT_OIHW) - << "Unsupported output layout: " << ToString(dst_filter_format); + if (dst_filter_format == FORMAT_OIHW) { + TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>, + config.block_count, config.thread_per_block, + 0, d.stream(), config.virtual_thread_count, + in.data(), combined_dims, out.data())); - TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>, - config.block_count, config.thread_per_block, 0, - d.stream(), config.virtual_thread_count, - in.data(), combined_dims, out.data())); + } else if (dst_filter_format == FORMAT_OHWI) { + TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 1, 2, 0>, + config.block_count, config.thread_per_block, + 0, d.stream(), config.virtual_thread_count, + in.data(), combined_dims, out.data())); + + } else { + LOG(ERROR) << "Unsupported filter format: " + << ToString(dst_filter_format); + } } }; @@ -458,7 +467,7 @@ struct ReverseTransformFilter<GPUDevice, T, NDIMS> { for (int i = 3; i < NDIMS; ++i) { combined_dims[2] *= in.dimension(i); } - CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d); + GpuLaunchConfig config = GetCudaLaunchConfig(out.size(), d); TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, @@ -477,7 +486,7 @@ struct PadInput<GPUDevice, T, int, NDIMS> { const std::array<int, NDIMS - 2>& padding_right, typename TTypes<T, NDIMS, int>::Tensor out, TensorFormat format) { - CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d); + GpuLaunchConfig config = GetCudaLaunchConfig(out.size(), d); Dimension<NDIMS> input_dims; for (int i = 0; i < NDIMS; ++i) { input_dims[i] = in.dimension(i); @@ -930,7 +939,7 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input, d, input, input_dims, output, kMinDimensionToUseTiles); } else { int total_element_count = input_dims[0] * input_dims[1] * input_dims[2]; - CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d); + GpuLaunchConfig config = GetCudaLaunchConfig(total_element_count, d); TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 0, 2, 1, conjugate>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, input, @@ -963,7 +972,7 @@ struct SwapDimension0And2InTensor3<GPUDevice, T, conjugate> { static_cast<int>(combined_dims[1]), static_cast<int>(combined_dims[2])}; size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; - CudaLaunchConfig config = GetCudaLaunchConfig(total_size, d); + GpuLaunchConfig config = GetCudaLaunchConfig(total_size, d); TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0, conjugate>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, in, diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index ec54ece9d7c..cb879e7226a 100644 --- 
a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/kernels/conv_ops.h" #include <string.h> + #include <map> #include <vector> @@ -561,6 +562,15 @@ template struct LaunchConv2DOp<CPUDevice, float>; template struct LaunchConv2DOp<CPUDevice, double>; #if GOOGLE_CUDA +// Returns true if the given StreamExecutor is for a Volta or newer nvidia GPU. +bool IsVoltaOrLater(const se::StreamExecutor& stream_exec) { + int major, minor; + CHECK(stream_exec // Crash OK + .GetDeviceDescription() + .cuda_compute_capability(&major, &minor)); + return major >= 7; +} + int64 GetDnnWorkspaceLimit(const string& envvar_in_mb, int64 default_value_in_bytes) { const char* workspace_limit_in_mb_str = getenv(envvar_in_mb.c_str()); @@ -676,6 +686,24 @@ void LaunchConv2DOp<GPUDevice, T>::operator()( return; } + // Tensor Core (NVIDIA Volta+ GPUs) supports efficient convolution with fp16 + // in NHWC data layout. In all other configurations it's more efficient to + // run computation in NCHW data format. + const bool compute_in_nhwc = + DataTypeToEnum<T>::value == DT_HALF && IsVoltaOrLater(*stream->parent()); + + // We only do one directional conversion: NHWC->NCHW. We never convert in the + // other direction. Grappler layout optimizer selects preferred layout and + // adds necessary annotations to the graph. + // TODO(ezhulenev): Convert in other direction for fp16? + const TensorFormat compute_data_format = + (compute_in_nhwc && data_format == FORMAT_NHWC) ? FORMAT_NHWC + : FORMAT_NCHW; + + VLOG(3) << "Compute Conv2D with cuDNN:" + << " data_format=" << ToString(data_format) + << " compute_data_format=" << ToString(compute_data_format); + const int64 out_batch = GetTensorDim(*output, data_format, 'N'); const int64 out_rows = GetTensorDim(*output, data_format, 'H'); const int64 out_cols = GetTensorDim(*output, data_format, 'W'); @@ -708,6 +736,11 @@ void LaunchConv2DOp<GPUDevice, T>::operator()( // cuDNN only supports padding the same amount on the left and right sides, // and on the top and bottom sides. So we manually create a new padded // input tensor such that we can pass it to cuDNN. + VLOG(4) << "Pad input tensor:" + << " padding_top=" << padding_top + << " padding_bottom=" << padding_bottom + << " padding_left=" << padding_left + << " padding_right=" << padding_right; // TODO(reedwm): In some cases, we can avoid an allocation even if the two // padding sides are different. For example, if the input is 2x2, the filter @@ -750,8 +783,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()( in_cols = new_in_cols; } - if (data_format == FORMAT_NHWC) { - // Convert the input tensor from NHWC to NCHW. + if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { + VLOG(4) << "Convert the input tensor from NHWC to NCHW."; + TensorShape nchw_shape = ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths); if (in_depths > 1) { @@ -767,28 +801,48 @@ void LaunchConv2DOp<GPUDevice, T>::operator()( // If depth <= 1, then just reshape. 
CHECK(input.CopyFrom(input, nchw_shape)); } + } else { + CHECK(data_format == compute_data_format) // Crash OK + << "Illegal data and compute format pair:" + << " data_format=" << ToString(data_format) + << " compute_data_format=" << ToString(compute_data_format); } CHECK(common_padding_rows >= 0 && common_padding_cols >= 0) // Crash OK << "Negative row or col paddings: (" << common_padding_rows << ", " << common_padding_cols << ")"; + + constexpr auto kComputeInNHWC = + std::make_tuple(se::dnn::DataLayout::kBatchYXDepth, + se::dnn::FilterLayout::kOutputYXInput); + constexpr auto kComputeInNCHW = + std::make_tuple(se::dnn::DataLayout::kBatchDepthYX, + se::dnn::FilterLayout::kOutputInputYX); + + se::dnn::DataLayout compute_data_layout; + se::dnn::FilterLayout filter_layout; + + std::tie(compute_data_layout, filter_layout) = + compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW; + se::dnn::BatchDescriptor input_desc; input_desc.set_count(in_batch) .set_feature_map_count(in_depths) .set_height(in_rows) .set_width(in_cols) - .set_layout(se::dnn::DataLayout::kBatchDepthYX); + .set_layout(compute_data_layout); se::dnn::BatchDescriptor output_desc; output_desc.set_count(out_batch) .set_height(out_rows) .set_width(out_cols) .set_feature_map_count(out_depths) - .set_layout(se::dnn::DataLayout::kBatchDepthYX); + .set_layout(compute_data_layout); se::dnn::FilterDescriptor filter_desc; filter_desc.set_input_filter_height(patch_rows) .set_input_filter_width(patch_cols) .set_input_feature_map_count(patch_depths) - .set_output_feature_map_count(filter.dim_size(3)); + .set_output_feature_map_count(filter.dim_size(3)) + .set_layout(filter_layout); se::dnn::ConvolutionDescriptor conv_desc; conv_desc.set_vertical_dilation_rate(row_dilation) .set_horizontal_dilation_rate(col_dilation) @@ -799,22 +853,44 @@ void LaunchConv2DOp<GPUDevice, T>::operator()( .set_group_count(in_depths / patch_depths); Tensor transformed_filter; - OP_REQUIRES_OK(ctx, ctx->allocate_temp( - DataTypeToEnum<T>::value, - TensorShape({filter.dim_size(3), filter.dim_size(2), - filter.dim_size(0), filter.dim_size(1)}), - &transformed_filter)); - functor::TransformFilter<GPUDevice, T, int, 4>()( - ctx->eigen_device<GPUDevice>(), FORMAT_OIHW, - To32Bit(filter.tensor<T, 4>()), - To32Bit(transformed_filter.tensor<T, 4>())); + + const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status { + VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO) + << " to " << ToString(dst_format); + + TensorShape dst_shape = + dst_format == FORMAT_OIHW + ? 
TensorShape({filter.dim_size(3), filter.dim_size(2), + filter.dim_size(0), filter.dim_size(1)}) + : TensorShape({filter.dim_size(3), filter.dim_size(0), + filter.dim_size(1), filter.dim_size(2)}); + + TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value, dst_shape, + &transformed_filter)); + functor::TransformFilter<GPUDevice, T, int, 4>()( + ctx->eigen_device<GPUDevice>(), dst_format, + To32Bit(filter.tensor<T, 4>()), + To32Bit(transformed_filter.tensor<T, 4>())); + + return Status::OK(); + }; + + if (compute_data_format == FORMAT_NCHW) { + OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OIHW)); + } else if (compute_data_format == FORMAT_NHWC) { + OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OHWI)); + } else { + ctx->SetStatus(errors::InvalidArgument("Invalid compute data format: ", + ToString(compute_data_format))); + return; + } Tensor transformed_output; - if (data_format == FORMAT_NHWC) { - // Only allocate temporary memory when a layout transformation is needed. + if (data_format != compute_data_format) { + VLOG(4) << "Allocate temporary memory for output in compute data format"; OP_REQUIRES_OK( ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, - ShapeFromFormat(FORMAT_NCHW, out_batch, + ShapeFromFormat(compute_data_format, out_batch, out_rows, out_cols, out_depths), &transformed_output)); } else { @@ -842,7 +918,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()( in_depths, // in_depths {{in_rows, // in_rows in_cols}}, // in_cols - FORMAT_NCHW, // compute_data_format + compute_data_format, // compute_data_format out_depths, // out_depths {{patch_rows, // filter_rows patch_cols, // filter_cols @@ -901,6 +977,11 @@ void LaunchConv2DOp<GPUDevice, T>::operator()( AutoTuneConv::GetInstance()->Insert(conv_parameters, algorithm_config); } + VLOG(4) << "Convolution Algorithm: " + << algorithm_config.algorithm()->algo_id(); + VLOG(4) << "tensor_ops_enabled: " + << algorithm_config.algorithm()->tensor_ops_enabled(); + DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx); bool cudnn_launch_status = stream @@ -916,8 +997,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()( ") filter shape(", filter.shape().DebugString(), ")")); } - // Convert the output tensor back from NCHW to NHWC. - if (data_format == FORMAT_NHWC) { + if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { + VLOG(4) << "Convert the output tensor back from NCHW to NHWC."; functor::NCHWToNHWC<GPUDevice, T, 4>()( ctx->eigen_device<GPUDevice>(), const_cast<const Tensor&>(transformed_output).tensor<T, 4>(), diff --git a/tensorflow/core/kernels/conv_ops_benchmark_test.cc b/tensorflow/core/kernels/conv_ops_benchmark_test.cc index 259a2f2e570..a03f62b80b1 100644 --- a/tensorflow/core/kernels/conv_ops_benchmark_test.cc +++ b/tensorflow/core/kernels/conv_ops_benchmark_test.cc @@ -29,7 +29,7 @@ limitations under the License. namespace tensorflow { //////////////////////////////////////////////////////////////////////////////// -// Performance benchmarks for the FusedConv2Op. // +// Performance benchmarks for the Conv2DOp and FusedConv2Op. 
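Aside (minimal sketch with plain enums rather than TensorFlow/StreamExecutor types): the layout decision that the new LaunchConv2DOp code above spreads across IsVoltaOrLater and compute_in_nhwc reduces to the following.

enum class Format { NHWC, NCHW };

// NHWC is kept only for fp16 data that is already in NHWC on a Volta-or-newer
// GPU (compute capability >= 7); every other configuration computes in NCHW.
Format ChooseComputeFormat(bool is_half, int cc_major, Format data_format) {
  const bool compute_in_nhwc = is_half && cc_major >= 7;
  return (compute_in_nhwc && data_format == Format::NHWC) ? Format::NHWC
                                                          : Format::NCHW;
}

When data_format is NHWC but the chosen compute format is NCHW, the input is transposed before the convolution and the output transposed back afterwards, exactly as in the pre-existing code path.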
// //////////////////////////////////////////////////////////////////////////////// struct Conv2DGraph { @@ -63,19 +63,27 @@ struct Conv2DWithBatchNormAndActivationGraph { Node* activation; }; +template <typename T> static Tensor MakeRandomTensor(const TensorShape& shape) { - Tensor tensor(DT_FLOAT, TensorShape(shape)); - tensor.flat<float>() = tensor.flat<float>().setRandom(); + Tensor tensor(DataTypeToEnum<T>::value, TensorShape(shape)); + tensor.flat<T>() = tensor.flat<T>().setRandom(); return tensor; } // Creates a simple Tensorflow graph with single Conv2D node. +template <typename T> static Conv2DGraph Conv2D(int batch, int height, int width, int in_depth, - int filter_w, int filter_h, int out_depth) { + int filter_w, int filter_h, int out_depth, + TensorFormat data_format = FORMAT_NHWC) { Graph* graph = new Graph(OpRegistry::Global()); - Tensor images_t = MakeRandomTensor({batch, height, width, in_depth}); - Tensor filter_t = MakeRandomTensor({filter_w, filter_h, in_depth, out_depth}); + Tensor images_t = data_format == FORMAT_NHWC + ? MakeRandomTensor<T>({batch, height, width, in_depth}) + : MakeRandomTensor<T>({batch, in_depth, height, width}); + + // Filter is always in HWIO. + Tensor filter_t = + MakeRandomTensor<T>({filter_w, filter_h, in_depth, out_depth}); Node* images = test::graph::Constant(graph, images_t, "images"); Node* filter = test::graph::Constant(graph, filter_t, "filter"); @@ -84,33 +92,35 @@ static Conv2DGraph Conv2D(int batch, int height, int width, int in_depth, TF_CHECK_OK(NodeBuilder(graph->NewName("conv"), "Conv2D") .Input(images) .Input(filter) - .Attr("T", DT_FLOAT) + .Attr("T", DataTypeToEnum<T>::value) .Attr("strides", {1, 1, 1, 1}) .Attr("padding", "SAME") + .Attr("data_format", ToString(data_format)) .Finalize(graph, &conv2d)); return {graph, conv2d}; } // Creates a Tensorflow graph with a Conv2D node followed by BiasAdd. -static Conv2DWithBiasGraph Conv2DWithBias(int batch, int height, int width, - int in_depth, int filter_w, - int filter_h, int out_depth) { - Conv2DGraph conv_graph = - Conv2D(batch, height, width, in_depth, filter_w, filter_h, out_depth); +template <typename T> +static Conv2DWithBiasGraph Conv2DWithBias( + int batch, int height, int width, int in_depth, int filter_w, int filter_h, + int out_depth, TensorFormat data_format = FORMAT_NHWC) { + Conv2DGraph conv_graph = Conv2D<T>(batch, height, width, in_depth, filter_w, + filter_h, out_depth, data_format); Graph* graph = conv_graph.graph; Node* conv2d = conv_graph.conv2d; - Tensor bias_t = MakeRandomTensor({out_depth}); + Tensor bias_t = MakeRandomTensor<T>({out_depth}); Node* bias = test::graph::Constant(graph, bias_t, "bias"); Node* out; TF_CHECK_OK(NodeBuilder(graph->NewName("bias"), "BiasAdd") .Input(conv2d) .Input(bias) - .Attr("T", DT_FLOAT) - .Attr("data_format", "NHWC") + .Attr("T", DataTypeToEnum<T>::value) + .Attr("data_format", ToString(data_format)) .Finalize(graph, &out)); return {graph, conv2d, out}; @@ -118,11 +128,14 @@ static Conv2DWithBiasGraph Conv2DWithBias(int batch, int height, int width, // Creates a Tensorflow graph with a Conv2D node followed by BiasAdd and // activation (Relu, Relu6, etc...). 
+template <typename T> static Conv2DWithBiasAndActivationGraph Conv2DWithBiasAndActivation( int batch, int height, int width, int in_depth, int filter_w, int filter_h, - int out_depth, const string& activation_type) { - Conv2DWithBiasGraph conv_graph = Conv2DWithBias( - batch, height, width, in_depth, filter_w, filter_h, out_depth); + int out_depth, const string& activation_type, + TensorFormat data_format = FORMAT_NHWC) { + Conv2DWithBiasGraph conv_graph = + Conv2DWithBias<T>(batch, height, width, in_depth, filter_w, filter_h, + out_depth, data_format); Graph* graph = conv_graph.graph; Node* conv2d = conv_graph.conv2d; @@ -131,27 +144,27 @@ static Conv2DWithBiasAndActivationGraph Conv2DWithBiasAndActivation( Node* activation; TF_CHECK_OK(NodeBuilder(graph->NewName("activation"), activation_type) .Input(bias) - .Attr("T", DT_FLOAT) + .Attr("T", DataTypeToEnum<T>::value) .Finalize(graph, &activation)); return {graph, conv2d, bias, activation}; } // Creates a Tensorflow graph with a Conv2D node followed by FusedBatchNorm. -static Conv2DWithBatchNormGraph Conv2DWithBatchNorm(int batch, int height, - int width, int in_depth, - int filter_w, int filter_h, - int out_depth) { - Conv2DGraph conv_graph = - Conv2D(batch, height, width, in_depth, filter_w, filter_h, out_depth); +template <typename T> +static Conv2DWithBatchNormGraph Conv2DWithBatchNorm( + int batch, int height, int width, int in_depth, int filter_w, int filter_h, + int out_depth, TensorFormat data_format = FORMAT_NHWC) { + Conv2DGraph conv_graph = Conv2D<T>(batch, height, width, in_depth, filter_w, + filter_h, out_depth, data_format); Graph* graph = conv_graph.graph; Node* conv2d = conv_graph.conv2d; - Tensor scale_t = MakeRandomTensor({out_depth}); - Tensor offset_t = MakeRandomTensor({out_depth}); - Tensor mean_t = MakeRandomTensor({out_depth}); - Tensor variance_t = MakeRandomTensor({out_depth}); + Tensor scale_t = MakeRandomTensor<T>({out_depth}); + Tensor offset_t = MakeRandomTensor<T>({out_depth}); + Tensor mean_t = MakeRandomTensor<T>({out_depth}); + Tensor variance_t = MakeRandomTensor<T>({out_depth}); Node* scale = test::graph::Constant(graph, scale_t, "scale"); Node* offset = test::graph::Constant(graph, offset_t, "offset"); @@ -165,8 +178,9 @@ static Conv2DWithBatchNormGraph Conv2DWithBatchNorm(int batch, int height, .Input(offset) .Input(mean) .Input(variance) - .Attr("T", DT_FLOAT) + .Attr("T", DataTypeToEnum<T>::value) .Attr("is_training", false) + .Attr("data_format", ToString(data_format)) .Finalize(graph, &out)); return {graph, conv2d, out}; @@ -174,11 +188,14 @@ static Conv2DWithBatchNormGraph Conv2DWithBatchNorm(int batch, int height, // Creates a Tensorflow graph with a Conv2D node followed by FusedBatchNorm and // activation (Relu, Relu6, etc...). 
+template <typename T> static Conv2DWithBatchNormAndActivationGraph Conv2DWithBatchNormAndActivation( int batch, int height, int width, int in_depth, int filter_w, int filter_h, - int out_depth, const string& activation_type) { - Conv2DWithBatchNormGraph conv_graph = Conv2DWithBatchNorm( - batch, height, width, in_depth, filter_w, filter_h, out_depth); + int out_depth, const string& activation_type, + TensorFormat data_format = FORMAT_NHWC) { + Conv2DWithBatchNormGraph conv_graph = + Conv2DWithBatchNorm<T>(batch, height, width, in_depth, filter_w, filter_h, + out_depth, data_format); Graph* graph = conv_graph.graph; Node* conv2d = conv_graph.conv2d; @@ -187,7 +204,7 @@ static Conv2DWithBatchNormAndActivationGraph Conv2DWithBatchNormAndActivation( Node* activation; TF_CHECK_OK(NodeBuilder(graph->NewName("activation"), activation_type) .Input(batch_norm) - .Attr("T", DT_FLOAT) + .Attr("T", DataTypeToEnum<T>::value) .Finalize(graph, &activation)); return {graph, conv2d, batch_norm, activation}; @@ -195,15 +212,22 @@ static Conv2DWithBatchNormAndActivationGraph Conv2DWithBatchNormAndActivation( // Creates a tensorflow graph with a single FusedConv2D (with BiasAdd) node and // fuses into it additional computations (e.g. Relu). +template <typename T> static Graph* FusedConv2DWithBias(int batch, int height, int width, int in_depth, int filter_w, int filter_h, int out_depth, - const std::vector<string>& fused_ops = {}) { + const std::vector<string>& fused_ops = {}, + TensorFormat data_format = FORMAT_NHWC) { Graph* graph = new Graph(OpRegistry::Global()); - Tensor images_t = MakeRandomTensor({batch, height, width, in_depth}); - Tensor filter_t = MakeRandomTensor({filter_w, filter_h, in_depth, out_depth}); - Tensor bias_t = MakeRandomTensor({out_depth}); + Tensor images_t = data_format == FORMAT_NHWC + ? MakeRandomTensor<T>({batch, height, width, in_depth}) + : MakeRandomTensor<T>({batch, in_depth, height, width}); + + // Filter is always in HWIO. + Tensor filter_t = + MakeRandomTensor<T>({filter_w, filter_h, in_depth, out_depth}); + Tensor bias_t = MakeRandomTensor<T>({out_depth}); Node* images = test::graph::Constant(graph, images_t, "images"); Node* filter = test::graph::Constant(graph, filter_t, "filter"); @@ -217,7 +241,7 @@ static Graph* FusedConv2DWithBias(int batch, int height, int width, .Input(filter) .Attr("num_args", 1) .Input(args) - .Attr("T", DT_FLOAT) + .Attr("T", DataTypeToEnum<T>::value) .Attr("strides", {1, 1, 1, 1}) .Attr("padding", "SAME") .Attr("fused_ops", fused_ops) @@ -228,17 +252,24 @@ static Graph* FusedConv2DWithBias(int batch, int height, int width, // Creates a tensorflow graph with a single FusedConv2D (with FusedBatchNorm) // node and fuses into it additional computations (e.g. Relu). +template <typename T> static Graph* FusedConv2DWithBatchNorm( int batch, int height, int width, int in_depth, int filter_w, int filter_h, - int out_depth, const std::vector<string>& fused_ops = {}) { + int out_depth, const std::vector<string>& fused_ops = {}, + TensorFormat data_format = FORMAT_NHWC) { Graph* graph = new Graph(OpRegistry::Global()); - Tensor images_t = MakeRandomTensor({batch, height, width, in_depth}); - Tensor filter_t = MakeRandomTensor({filter_w, filter_h, in_depth, out_depth}); - Tensor scale_t = MakeRandomTensor({out_depth}); - Tensor offset_t = MakeRandomTensor({out_depth}); - Tensor mean_t = MakeRandomTensor({out_depth}); - Tensor variance_t = MakeRandomTensor({out_depth}); + Tensor images_t = data_format == FORMAT_NHWC + ? 
MakeRandomTensor<T>({batch, height, width, in_depth}) + : MakeRandomTensor<T>({batch, in_depth, height, width}); + + // Filter is always in HWIO. + Tensor filter_t = + MakeRandomTensor<T>({filter_w, filter_h, in_depth, out_depth}); + Tensor scale_t = MakeRandomTensor<T>({out_depth}); + Tensor offset_t = MakeRandomTensor<T>({out_depth}); + Tensor mean_t = MakeRandomTensor<T>({out_depth}); + Tensor variance_t = MakeRandomTensor<T>({out_depth}); Node* images = test::graph::Constant(graph, images_t, "images"); Node* filter = test::graph::Constant(graph, filter_t, "filter"); @@ -255,7 +286,7 @@ static Graph* FusedConv2DWithBatchNorm( .Input(filter) .Attr("num_args", 4) .Input(args) - .Attr("T", DT_FLOAT) + .Attr("T", DataTypeToEnum<T>::value) .Attr("strides", {1, 1, 1, 1}) .Attr("padding", "SAME") .Attr("fused_ops", fused_ops) @@ -273,6 +304,10 @@ static Graph* FusedConv2DWithBatchNorm( // FH: filter height // FW: filter width +// -------------------------------------------------------------------------- // +// Following benchmarks are always using 'float' data type with NHWC layout. +// -------------------------------------------------------------------------- // + #define BM_SETUP(N, H, W, C, type, LABEL, NAME) \ testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * (C)); \ testing::SetLabel(LABEL); @@ -280,39 +315,41 @@ static Graph* FusedConv2DWithBatchNorm( #define BM_NAME(name, type, N, H, W, C, FW, FH, FC) \ name##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC -#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL) \ - static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC)(int iters) { \ - BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, Conv2D(N, H, W, C, FW, FH, FC).graph).Run(iters); \ - } \ +#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL) \ + static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC)(int iters) { \ + BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ + test::Benchmark(#type, Conv2D<float>(N, H, W, C, FW, FH, FC).graph) \ + .Run(iters); \ + } \ BENCHMARK(BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC)); #define BM_Conv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \ static void BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH, \ FC)(int iters) { \ BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, Conv2DWithBias(N, H, W, C, FW, FH, FC).graph) \ + test::Benchmark(#type, \ + Conv2DWithBias<float>(N, H, W, C, FW, FH, FC).graph) \ .Run(iters); \ } \ BENCHMARK(BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH, FC)); -#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \ - static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \ - FC)(int iters) { \ - BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark( \ - #type, \ - Conv2DWithBiasAndActivation(N, H, W, C, FW, FH, FC, "Relu").graph) \ - .Run(iters); \ - } \ +#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \ + static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \ + FC)(int iters) { \ + BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ + test::Benchmark(#type, Conv2DWithBiasAndActivation<float>(N, H, W, C, FW, \ + FH, FC, "Relu") \ + .graph) \ + .Run(iters); \ + } \ BENCHMARK(BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC)); #define BM_FusedConv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \ static void BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, \ FC)(int iters) { \ BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, 
\ - FusedConv2DWithBias(N, H, W, C, FW, FH, FC, {"BiasAdd"})) \ + test::Benchmark(#type, FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, \ + {"BiasAdd"})) \ .Run(iters); \ } \ BENCHMARK(BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, FC)); @@ -321,8 +358,8 @@ static Graph* FusedConv2DWithBatchNorm( static void BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \ FC)(int iters) { \ BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, FusedConv2DWithBias(N, H, W, C, FW, FH, FC, \ - {"BiasAdd", "Relu"})) \ + test::Benchmark(#type, FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, \ + {"BiasAdd", "Relu"})) \ .Run(iters); \ } \ BENCHMARK( \ @@ -332,7 +369,8 @@ static Graph* FusedConv2DWithBatchNorm( static void BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, \ FC)(int iters) { \ BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC).graph) \ + test::Benchmark(#type, \ + Conv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC).graph) \ .Run(iters); \ } \ BENCHMARK(BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC)); @@ -341,8 +379,8 @@ static Graph* FusedConv2DWithBatchNorm( static void BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, \ FC)(int iters) { \ BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, Conv2DWithBatchNormAndActivation(N, H, W, C, FW, \ - FH, FC, "Relu") \ + test::Benchmark(#type, Conv2DWithBatchNormAndActivation<float>( \ + N, H, W, C, FW, FH, FC, "Relu") \ .graph) \ .Run(iters); \ } \ @@ -353,8 +391,8 @@ static Graph* FusedConv2DWithBatchNorm( static void BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, \ FC)(int iters) { \ BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC, \ - {"FusedBatchNorm"})) \ + test::Benchmark(#type, FusedConv2DWithBatchNorm<float>( \ + N, H, W, C, FW, FH, FC, {"FusedBatchNorm"})) \ .Run(iters); \ } \ BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC)); @@ -364,9 +402,9 @@ static Graph* FusedConv2DWithBatchNorm( static void BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, \ FW, FH, FC)(int iters) { \ BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, \ - FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC, \ - {"FusedBatchNorm", "Relu"})) \ + test::Benchmark( \ + #type, FusedConv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC, \ + {"FusedBatchNorm", "Relu"})) \ .Run(iters); \ } \ BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, FW, \ @@ -500,4 +538,63 @@ BM_FusedConv2DWithBiasAndRelu(16, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 16"); BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 32"); #endif +// Macro arguments names: --------------------------------------------------- // +// T: data type +// FORMAT: data format (NHWC or NCHW) +// N: batch size +// H: height +// W: width +// C: channels +// FC: filter count +// FH: filter height +// FW: filter width + +// -------------------------------------------------------------------------- // +// Following benchmarks are used to compare different data format performance +// for different data types. They make sense only when CUDA enabled, because on +// CPU we only support data in NHWC. 
+// -------------------------------------------------------------------------- // + +#define BM_LONG_NAME(name, type, T, FORMAT, N, H, W, C, FW, FH, FC) \ + name##_##T##_##FORMAT##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC + +#define BM_Conv2DFmt(T, FORMAT, N, H, W, C, FW, FH, FC, type) \ + static void BM_LONG_NAME(BM_Conv2D, type, T, FORMAT, N, H, W, C, FW, FH, \ + FC)(int iters) { \ + BM_SETUP(N, H, W, C, type, "", Conv2D); \ + test::Benchmark(#type, \ + Conv2D<T>(N, H, W, C, FW, FH, FC, FORMAT_##FORMAT).graph) \ + .Run(iters); \ + } \ + BENCHMARK(BM_LONG_NAME(BM_Conv2D, type, T, FORMAT, N, H, W, C, FW, FH, FC)); + +#if GOOGLE_CUDA +using fp32 = float; +using fp16 = Eigen::half; + +// ResNet50-ish convolutions. +#define BENCHMARK_DTYPE(BATCH, T) \ + BM_Conv2DFmt(T, NHWC, BATCH, 56, 56, 64, 1, 1, 64, gpu); \ + BM_Conv2DFmt(T, NHWC, BATCH, 56, 56, 64, 1, 1, 256, gpu); \ + BM_Conv2DFmt(T, NHWC, BATCH, 56, 56, 256, 1, 1, 64, gpu); \ + BM_Conv2DFmt(T, NHWC, BATCH, 56, 56, 64, 3, 3, 64, gpu); \ + \ + BM_Conv2DFmt(T, NHWC, BATCH, 28, 28, 128, 1, 1, 128, gpu); \ + BM_Conv2DFmt(T, NHWC, BATCH, 28, 28, 128, 1, 1, 512, gpu); \ + BM_Conv2DFmt(T, NHWC, BATCH, 28, 28, 512, 1, 1, 128, gpu); \ + BM_Conv2DFmt(T, NHWC, BATCH, 28, 28, 512, 3, 3, 128, gpu); \ + \ + BM_Conv2DFmt(T, NHWC, BATCH, 14, 14, 256, 1, 1, 256, gpu); \ + BM_Conv2DFmt(T, NHWC, BATCH, 14, 14, 256, 1, 1, 1024, gpu); \ + BM_Conv2DFmt(T, NHWC, BATCH, 14, 14, 1024, 1, 1, 256, gpu); \ + BM_Conv2DFmt(T, NHWC, BATCH, 14, 14, 256, 3, 3, 256, gpu); + +BENCHMARK_DTYPE(32, fp32); +BENCHMARK_DTYPE(32, fp16); + +BENCHMARK_DTYPE(64, fp32); +BENCHMARK_DTYPE(64, fp16); + +#endif // GOOGLE_CUDA + } // namespace tensorflow diff --git a/tensorflow/core/kernels/conv_ops_fused_impl.h b/tensorflow/core/kernels/conv_ops_fused_impl.h index 89e4e61c26f..2002fedf5db 100644 --- a/tensorflow/core/kernels/conv_ops_fused_impl.h +++ b/tensorflow/core/kernels/conv_ops_fused_impl.h @@ -57,7 +57,7 @@ limitations under the License. 
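For example (reading off the BM_LONG_NAME and BM_Conv2DFmt macros added to conv_ops_benchmark_test.cc above, nothing new): BM_Conv2DFmt(fp16, NHWC, 32, 56, 56, 64, 1, 1, 64, gpu) registers a benchmark named BM_Conv2D_fp16_NHWC_gpu_32_56_56_64_1_1_64 that builds Conv2D<Eigen::half>(32, 56, 56, 64, 1, 1, 64, FORMAT_NHWC) and runs it on the GPU device, so the fp32 and fp16 variants can be compared directly by name.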
#include "tensorflow/core/util/use_cudnn.h" #if GOOGLE_CUDA -#include "cuda/include/cudnn.h" +#include "third_party/gpus/cudnn/cudnn.h" #include "tensorflow/core/kernels/conv_ops_gpu.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/util/proto/proto_utils.h" @@ -609,8 +609,7 @@ struct LaunchFusedConv2DOp<GPUDevice, T> { auto status = FindBestConvolveAlgorithm<T>( conv_parameters, launch, context, stream, [&](absl::Span<const tensorflow::AutotuneResult> results) { - LogFusedConvAutotuneResults( - se::dnn::ConvolutionKind::FORWARD, + LogFusedConvForwardAutotuneResults( se::dnn::ToDataType<T>::value, input_desc, filter_desc, output_desc, conv_desc, 1.0, 0.0, dnn_activation_mode, stream->parent(), results); diff --git a/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc b/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc index 9cb78aefb94..5e76190a094 100644 --- a/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc +++ b/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc @@ -369,7 +369,7 @@ struct CropAndResize<GPUDevice, T> { } if (total_count > 0) { - CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); + GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel( CropAndResizeKernel<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, image.data(), @@ -400,12 +400,12 @@ struct CropAndResizeBackpropImage<GPUDevice, T> { const GPUDevice& d = context->eigen_device<GPUDevice>(); int total_count; - CudaLaunchConfig config; + GpuLaunchConfig config; // Initialize grads_image with all zeros. total_count = batch * image_height * image_width * depth; if (total_count > 0) { - config = GetCudaLaunchConfig(total_count, d); + config = GetGpuLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel( SetZero<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, grads_image.data())); @@ -420,7 +420,7 @@ struct CropAndResizeBackpropImage<GPUDevice, T> { // Accumulate. total_count = num_boxes * crop_height * crop_width * depth; if (total_count > 0) { - config = GetCudaLaunchConfig(total_count, d); + config = GetGpuLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel( CropAndResizeBackpropImageKernel<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, @@ -450,12 +450,12 @@ struct CropAndResizeBackpropBoxes<GPUDevice, T> { const int depth = grads.dimension(3); int total_count; - CudaLaunchConfig config; + GpuLaunchConfig config; // Initialize grads_boxes with all zeros. total_count = num_boxes * 4; if (total_count > 0) { - config = GetCudaLaunchConfig(total_count, d); + config = GetGpuLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel( SetZero<float>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, grads_boxes.data())); @@ -464,7 +464,7 @@ struct CropAndResizeBackpropBoxes<GPUDevice, T> { // Accumulate. 
total_count = num_boxes * crop_height * crop_width * depth; if (total_count > 0) { - config = GetCudaLaunchConfig(total_count, d); + config = GetGpuLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel( CropAndResizeBackpropBoxesKernel<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc index 82d92388d40..6e26cc1d541 100644 --- a/tensorflow/core/kernels/cuda_solvers.cc +++ b/tensorflow/core/kernels/cuda_solvers.cc @@ -21,8 +21,8 @@ #include <unordered_map> #include <vector> -#include "cuda/include/cublas_v2.h" -#include "cuda/include/cusolverDn.h" +#include "third_party/gpus/cuda/include/cublas_v2.h" +#include "third_party/gpus/cuda/include/cusolverDn.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h index fa8b4e24155..5b6d6891e29 100644 --- a/tensorflow/core/kernels/cuda_solvers.h +++ b/tensorflow/core/kernels/cuda_solvers.h @@ -26,8 +26,8 @@ limitations under the License. #include <functional> #include <vector> -#include "cuda/include/cublas_v2.h" -#include "cuda/include/cusolverDn.h" +#include "third_party/gpus/cuda/include/cublas_v2.h" +#include "third_party/gpus/cuda/include/cusolverDn.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/status.h" diff --git a/tensorflow/core/kernels/cuda_sparse.cc b/tensorflow/core/kernels/cuda_sparse.cc index 51a4d9cfc9b..3cdf50f896b 100644 --- a/tensorflow/core/kernels/cuda_sparse.cc +++ b/tensorflow/core/kernels/cuda_sparse.cc @@ -22,7 +22,7 @@ #include <utility> #include <vector> -#include "cuda/include/cusparse.h" +#include "third_party/gpus/cuda/include/cusparse.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" diff --git a/tensorflow/core/kernels/cuda_sparse.h b/tensorflow/core/kernels/cuda_sparse.h index e7c97082177..51ebd6832aa 100644 --- a/tensorflow/core/kernels/cuda_sparse.h +++ b/tensorflow/core/kernels/cuda_sparse.h @@ -25,7 +25,7 @@ limitations under the License. 
#include <functional> #include <vector> -#include "cuda/include/cusparse.h" +#include "third_party/gpus/cuda/include/cusparse.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" diff --git a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc index 85a4830baab..8da2dfb1d5a 100644 --- a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc +++ b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc @@ -60,7 +60,7 @@ struct UnaryClipOp<GPUDevice, T> { typename TTypes<T>::ConstFlat &in1_flat, typename TTypes<T>::ConstFlat &in2_flat, typename TTypes<T>::Flat &out_flat) const { - CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d); + GpuLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d); TF_CHECK_OK(CudaLaunchKernel( UnaryClipCustomKernel<T>, config.block_count, config.thread_per_block, @@ -76,7 +76,7 @@ struct BinaryRightClipOp<GPUDevice, T> { typename TTypes<T>::ConstFlat &in1_flat, typename TTypes<T>::ConstFlat &in2_flat, typename TTypes<T>::Flat &out_flat) const { - CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d); + GpuLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d); TF_CHECK_OK(CudaLaunchKernel( BinaryRightClipCustomKernel<T>, config.block_count, @@ -92,7 +92,7 @@ struct BinaryLeftClipOp<GPUDevice, T> { typename TTypes<T>::ConstFlat &in1_flat, typename TTypes<T>::ConstFlat &in2_flat, typename TTypes<T>::Flat &out_flat) const { - CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d); + GpuLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d); TF_CHECK_OK(CudaLaunchKernel( BinaryLeftClipCustomKernel<T>, config.block_count, diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 287a4f72b20..d35520b3a40 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -218,6 +218,26 @@ tf_kernel_library( ], ) +tf_cc_test( + name = "shard_dataset_op_test", + size = "small", + srcs = ["shard_dataset_op_test.cc"], + deps = [ + ":dataset_test_base", + ":dataset_utils", + ":iterator_ops", + ":range_dataset_op", + ":shard_dataset_op", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:dataset_ops_op_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib_internal", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_kernel_library( name = "window_dataset_op", srcs = ["window_dataset_op.cc"], @@ -297,6 +317,29 @@ tf_kernel_library( ], ) +tf_cc_test( + name = "filter_dataset_op_test", + size = "small", + srcs = ["filter_dataset_op_test.cc"], + deps = [ + ":dataset_test_base", + ":dataset_utils", + ":filter_dataset_op", + ":iterator_ops", + ":tensor_slice_dataset_op", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:dataset_ops_op_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib_internal", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/kernels:cwise_op", + "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/kernels:unique_op", + ], +) + tf_kernel_library( name = "filter_by_component_dataset_op", srcs = ["filter_by_component_dataset_op.cc"], @@ -309,6 +352,26 @@ tf_kernel_library( ], ) +tf_cc_test( + name = "filter_by_component_dataset_op_test", + size = "small", + srcs = ["filter_by_component_dataset_op_test.cc"], + deps = [ + ":dataset_test_base", + 
":dataset_utils", + ":filter_by_component_dataset_op", + ":iterator_ops", + ":tensor_slice_dataset_op", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:dataset_ops_op_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib_internal", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_kernel_library( name = "map_dataset_op", srcs = ["map_dataset_op.cc"], diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc index fb799b14852..fdb84e84f7c 100644 --- a/tensorflow/core/kernels/data/captured_function.cc +++ b/tensorflow/core/kernels/data/captured_function.cc @@ -516,7 +516,6 @@ Status InstantiatedCapturedFunction::Run(IteratorContext* ctx, } FunctionLibraryRuntime::Options f_opts; - f_opts.step_id = InstantiatedCapturedFunction::generate_step_id(); ScopedStepContainer step_container( f_opts.step_id, [this](const string& name) { lib_->device()->resource_manager()->Cleanup(name).IgnoreError(); @@ -558,7 +557,6 @@ Status InstantiatedCapturedFunction::RunWithBorrowedArgs( } FunctionLibraryRuntime::Options f_opts; - f_opts.step_id = InstantiatedCapturedFunction::generate_step_id(); ScopedStepContainer step_container( f_opts.step_id, [this](const string& name) { lib_->device()->resource_manager()->Cleanup(name).IgnoreError(); @@ -599,7 +597,6 @@ Status InstantiatedCapturedFunction::RunInstantiated( } FunctionLibraryRuntime::Options f_opts; - f_opts.step_id = InstantiatedCapturedFunction::generate_step_id(); ScopedStepContainer step_container( f_opts.step_id, [this](const string& name) { lib_->device()->resource_manager()->Cleanup(name).IgnoreError(); @@ -655,7 +652,6 @@ void InstantiatedCapturedFunction::RunAsync( std::move(args), &captured_func_->captured_inputs(), ret_types_); FunctionLibraryRuntime::Options f_opts; - f_opts.step_id = InstantiatedCapturedFunction::generate_step_id(); ResourceMgr* resource_mgr = lib_->device()->resource_manager(); ScopedStepContainer* step_container = new ScopedStepContainer( f_opts.step_id, [resource_mgr](const string& name) { diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h index 1f134888cb8..6a074d0719b 100644 --- a/tensorflow/core/kernels/data/captured_function.h +++ b/tensorflow/core/kernels/data/captured_function.h @@ -88,16 +88,6 @@ class InstantiatedCapturedFunction { FunctionLibraryRuntime::DoneCallback done, const string& prefix) const; - // Returns a step ID for use when running an `InstantiatedCapturedFunction`. - static int64 generate_step_id() { - // Choose a step ID that is guaranteed not to clash with any - // Session-generated step ID. DirectSession only generates - // non-negative step IDs (contiguous, starting from 0), and - // MasterSession generates 56-bit random step IDs whose MSB is - // always 0, so a negative random step ID should suffice. 
- return -std::abs(static_cast<int64>(random::New64())); - } - private: InstantiatedCapturedFunction( FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle, diff --git a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc index eb547133609..cda0885e7f0 100644 --- a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc @@ -99,7 +99,7 @@ class AssertNextDatasetOp : public UnaryDatasetOpKernel { Status Initialize(IteratorContext* ctx) override { std::vector<string> tokens = - str_util::Split(prefix(), ':', str_util::SkipEmpty()); + absl::StrSplit(prefix(), ':', absl::SkipEmpty()); if (dataset()->transformations_.size() > tokens.size() - 2) { return errors::InvalidArgument( "Asserted next ", dataset()->transformations_.size(), diff --git a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc index 4435c2a1313..fecafaacf2d 100644 --- a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc @@ -61,7 +61,7 @@ class CSVDatasetOp : public DatasetOpKernel { OP_REQUIRES(ctx, select_cols_tensor->dims() == 1, errors::InvalidArgument("`select_cols` must be a vector.")); - int64 buffer_size; + int64 buffer_size = 0; OP_REQUIRES_OK( ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size)); OP_REQUIRES(ctx, buffer_size > 0, diff --git a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc index 9453f2b1759..a7472a49e4a 100644 --- a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc @@ -61,13 +61,13 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { protected: void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - int64 batch_size; + int64 batch_size = 0; OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "batch_size", &batch_size)); OP_REQUIRES( ctx, batch_size > 0, errors::InvalidArgument("batch_size must be greater than zero.")); - int64 num_parallel_calls; + int64 num_parallel_calls = 0; OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls", &num_parallel_calls)); OP_REQUIRES( @@ -207,6 +207,15 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { } } + string BuildTraceMeName() override { + int64 parallelism; + { + tf_shared_lock l(*mu_); + parallelism = num_parallel_calls_->value; + } + return strings::StrCat(prefix(), "#parallelism=", parallelism, "#"); + } + Status Initialize(IteratorContext* ctx) override { mutex_lock l(*mu_); if (num_parallel_calls_->value == model::kAutoTune) { diff --git a/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc b/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc index 381b9691d14..6a8c9939a33 100644 --- a/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc @@ -308,7 +308,7 @@ class MatchingFilesDatasetOp : public DatasetOpKernel { const string child_path = io::JoinPath(current_dir, children[i]); // In case the child_path doesn't start with the fixed_prefix, then // we don't need to explore this path. 
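The hunks above and below replace tensorflow::str_util helpers with their Abseil equivalents: absl::StrSplit with absl::SkipEmpty in assert_next_dataset_op.cc, absl::StartsWith in matching_files_dataset_op.cc, and absl::StrJoin further down in optional_ops.h. A minimal standalone sketch of the replacement behavior follows; it is not part of the patch, and the prefix and path strings are illustrative assumptions only.

#include <string>
#include <vector>

#include "absl/strings/match.h"
#include "absl/strings/str_split.h"

int main() {
  // absl::SkipEmpty() drops the empty pieces produced by the "::" separators,
  // so an iterator prefix such as "Iterator::AssertNext::Range" yields exactly
  // three tokens, mirroring the str_util::Split(..., SkipEmpty()) call it replaces.
  std::vector<std::string> tokens =
      absl::StrSplit("Iterator::AssertNext::Range", ':', absl::SkipEmpty());

  // absl::StartsWith is a drop-in replacement for str_util::StartsWith.
  bool under_prefix = absl::StartsWith("/tmp/data/shard_0", "/tmp/data");

  return (tokens.size() == 3 && under_prefix) ? 0 : 1;
}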
- if (!str_util::StartsWith(child_path, fixed_prefix)) { + if (!absl::StartsWith(child_path, fixed_prefix)) { children_dir_status[i] = errors::Cancelled("Operation not needed"); } else { diff --git a/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc index 58c621587fe..3dbb4df8ada 100644 --- a/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc @@ -72,7 +72,7 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel { protected: void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - int64 num_parallel_calls; + int64 num_parallel_calls = 0; OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls", &num_parallel_calls)); OP_REQUIRES(ctx, num_parallel_calls > 0, diff --git a/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc index c5851eaf86b..dec136dd35e 100644 --- a/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc @@ -267,7 +267,7 @@ class SlidingWindowDatasetOp : public UnaryDatasetOpKernel { input_impl_.reset(); } // Restore buffer. - int64 buffer_size; + int64 buffer_size = 0; TF_RETURN_IF_ERROR( reader->ReadScalar(strings::StrCat("buffer_size"), &buffer_size)); buffer_.resize(buffer_size); diff --git a/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc b/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc index dca2ce72f9c..1ff5878bb65 100644 --- a/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" // NOLINT #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/compression.h" #include "tensorflow/core/lib/io/record_reader.h" #include "tensorflow/core/lib/io/record_writer.h" #include "tensorflow/core/lib/random/random.h" @@ -35,6 +36,10 @@ namespace { enum SnapshotMode { READER = 0, WRITER = 1, PASSTHROUGH = 2 }; +const uint64 kReaderBufferSize = 8 * 1024 * 1024; // 8 MB + +const char* kCompressionType = io::compression::kGzip; + const uint64 kOneDayInMicroseconds = 24L * 60L * 60L * 1e6L; const uint64 kNumElementsPerShard = 10000; @@ -75,9 +80,8 @@ Status ReadMetadataFile(const string& fingerprint_dir, TF_CHECK_OK(Env::Default()->NewRandomAccessFile(metadata_filename, &file)); string record_bytes; - auto reader = absl::make_unique<io::RecordReader>(file.get()); - uint64 offset = 0; - TF_CHECK_OK(reader->ReadRecord(&offset, &record_bytes)); + auto reader = absl::make_unique<io::SequentialRecordReader>(file.get()); + TF_CHECK_OK(reader->ReadRecord(&record_bytes)); metadata->ParseFromString(record_bytes); return Status::OK(); @@ -265,7 +269,6 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { if (current_read_filename_ != snapshot_data_filename) { current_reader_.reset(); current_read_file_.reset(); - current_read_offset_ = 0; // The current implementation here assumes that tensors are stored // in files which are named sequentially. 
If a file doesn't exist @@ -279,14 +282,18 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { TF_CHECK_OK(Env::Default()->NewRandomAccessFile( snapshot_data_filename, &current_read_file_)); - current_reader_ = - absl::make_unique<io::RecordReader>(current_read_file_.get()); + auto reader_options = + io::RecordReaderOptions::CreateRecordReaderOptions( + kCompressionType); + reader_options.buffer_size = kReaderBufferSize; + + current_reader_ = absl::make_unique<io::SequentialRecordReader>( + current_read_file_.get(), reader_options); current_read_filename_ = snapshot_data_filename; } string record_bytes; - Status s = - current_reader_->ReadRecord(&current_read_offset_, &record_bytes); + Status s = current_reader_->ReadRecord(&record_bytes); if (errors::IsOutOfRange(s)) { *end_of_sequence = true; @@ -295,6 +302,7 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { return s; } + *end_of_sequence = false; experimental::SnapshotRecord record; record.ParseFromString(record_bytes); @@ -319,9 +327,9 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_); string current_read_filename_ GUARDED_BY(mu_); - uint64 current_read_offset_ GUARDED_BY(mu_); std::unique_ptr<RandomAccessFile> current_read_file_ GUARDED_BY(mu_); - std::unique_ptr<io::RecordReader> current_reader_ GUARDED_BY(mu_); + std::unique_ptr<io::SequentialRecordReader> current_reader_ + GUARDED_BY(mu_); int64 next_index_ GUARDED_BY(mu_) = 0; @@ -396,10 +404,14 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { current_writer_.reset(); current_write_file_.reset(); + auto writer_options = + io::RecordWriterOptions::CreateRecordWriterOptions( + kCompressionType); + TF_RETURN_IF_ERROR(Env::Default()->NewWritableFile( snapshot_data_filename, &current_write_file_)); - current_writer_ = - absl::make_unique<io::RecordWriter>(current_write_file_.get()); + current_writer_ = absl::make_unique<io::RecordWriter>( + current_write_file_.get(), writer_options); current_write_filename_ = snapshot_data_filename; } diff --git a/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc b/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc index 3b9b319ea94..bbcc84db31b 100644 --- a/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc +++ b/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc @@ -130,7 +130,13 @@ class FilterByLastComponentDatasetOp : public UnaryDatasetOpKernel { return Status::OK(); } - matched = out_tensors->back().scalar<bool>()(); + const Tensor& last_component = out_tensors->back(); + if (last_component.NumElements() != 1 || + last_component.dtype() != DT_BOOL) { + return errors::InvalidArgument( + "Last component must be a bool scalar."); + } + matched = last_component.scalar<bool>()(); out_tensors->pop_back(); if (!matched) { // Clear the output tensor list since it didn't match. diff --git a/tensorflow/core/kernels/data/filter_by_component_dataset_op_test.cc b/tensorflow/core/kernels/data/filter_by_component_dataset_op_test.cc new file mode 100644 index 00000000000..04627dfae93 --- /dev/null +++ b/tensorflow/core/kernels/data/filter_by_component_dataset_op_test.cc @@ -0,0 +1,589 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/kernels/data/dataset_test_base.h" + +namespace tensorflow { +namespace data { +namespace { + +constexpr char kNodeName[] = "filter_by_last_component_dataset"; +constexpr char kOpName[] = "FilterByLastComponentDataset"; + +class FilterByLastComponentDatasetOpTest : public DatasetOpsTestBase { + protected: + // Creates `TensorSliceDataset` variant tensor from the input vector of + // tensors. + Status CreateTensorSliceDatasetTensor( + std::vector<Tensor> *const tensor_vector, Tensor *dataset_tensor) { + DatasetBase *tensor_slice_dataset; + TF_RETURN_IF_ERROR(CreateTensorSliceDataset( + "tensor_slice_node", tensor_vector, &tensor_slice_dataset)); + TF_RETURN_IF_ERROR( + StoreDatasetInVariantTensor(tensor_slice_dataset, dataset_tensor)); + return Status::OK(); + } + + // Creates a new `FilterByLastComponentDataset` op kernel. + Status CreateFilterByLastComponentDatasetKernel( + const DataTypeVector &output_types, + const std::vector<PartialTensorShape> &output_shapes, + std::unique_ptr<OpKernel> *op_kernel) { + NodeDef node_def = test::function::NDef( + kNodeName, kOpName, {"input_dataset"}, + {{"output_types", output_types}, {"output_shapes", output_shapes}}); + TF_RETURN_IF_ERROR(CreateOpKernel(node_def, op_kernel)); + return Status::OK(); + } + + // Creates a new `FilterByLastComponentDataset` op kernel context. + Status CreateFilterByLastComponentDatasetContext( + OpKernel *const op_kernel, + gtl::InlinedVector<TensorValue, 4> *const inputs, + std::unique_ptr<OpKernelContext> *context) { + TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs)); + TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context)); + return Status::OK(); + } +}; + +struct TestCase { + std::vector<Tensor> input_tensors; + std::vector<Tensor> expected_outputs; + DataTypeVector expected_output_dtypes; + std::vector<PartialTensorShape> expected_output_shapes; + int64 expected_cardinality; + std::vector<int> breakpoints; +}; + +// Test case 1: simple case. +TestCase TestCase1() { + return {/*input_tensors*/ + {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2}, + {0, 1, 2, 3, 4, 5}), + DatasetOpsTestBase::CreateTensor<bool>(TensorShape{3, 1}, + {true, false, true})}, + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {0, 1}), + DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {4, 5})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({2})}, + /*expected_cardinality*/ kUnknownCardinality, + /*breakpoints*/ {0, 1, 5}}; +} + +// Test case 2: the output of input dataset is empty. +TestCase TestCase2() { + return {/*input_tensors*/ + {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{0}, {})}, + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ kUnknownCardinality, + /*breakpoints*/ {0}}; +} + +// Test case 3: the output of input dataset has only one component. 
+TestCase TestCase3() { + return {/*input_tensors*/ + {DatasetOpsTestBase::CreateTensor<bool>(TensorShape{3, 1}, + {true, false, true})}, + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_BOOL}, + /*expected_output_shapes*/ {PartialTensorShape({1})}, + /*expected_cardinality*/ kUnknownCardinality, + /*breakpoints*/ {0, 1, 5}}; +} + +// Test case 4: the last component has more than one element. +TestCase InvalidLastComponentShape() { + return {/*input_tensors*/ + {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2}, + {0, 1, 2, 3, 4, 5}), + DatasetOpsTestBase::CreateTensor<bool>( + TensorShape{3, 2}, {true, false, true, true, false, true})}, + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({2})}, + /*expected_cardinality*/ kUnknownCardinality, + /*breakpoints*/ {}}; +} + +// Test case 5: the data type of last component is not DT_BOOL. +TestCase InvalidLastComponentDType() { + return {/*input_tensors*/ + {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{3, 2}, + {0, 1, 2, 3, 4, 5}), + DatasetOpsTestBase::CreateTensor<int>(TensorShape{3}, {1, 1, 0})}, + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({2})}, + /*expected_cardinality*/ kUnknownCardinality, + /*breakpoints*/ {}}; +} + +class ParameterizedFilterByLastComponentDatasetOpTest + : public FilterByLastComponentDatasetOpTest, + public ::testing::WithParamInterface<TestCase> {}; + +TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, GetNext) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + const TestCase &test_case = GetParam(); + + std::unique_ptr<OpKernel> filter_by_last_component_dataset_kernel; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel( + test_case.expected_output_dtypes, test_case.expected_output_shapes, + &filter_by_last_component_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_by_last_component_dataset_context; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext( + filter_by_last_component_dataset_kernel.get(), &inputs, + &filter_by_last_component_dataset_context)); + DatasetBase *filter_by_last_component_dataset; + TF_ASSERT_OK(CreateDataset(filter_by_last_component_dataset_kernel.get(), + filter_by_last_component_dataset_context.get(), + &filter_by_last_component_dataset)); + core::ScopedUnref scoped_unref(filter_by_last_component_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK(CreateIteratorContext( + filter_by_last_component_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK(filter_by_last_component_dataset->MakeIterator( + iterator_ctx.get(), "Iterator", &iterator)); + bool end_of_sequence = false; + std::vector<Tensor> out_tensors; + while (!end_of_sequence) { + std::vector<Tensor> next; + TF_EXPECT_OK( + iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence)); + out_tensors.insert(out_tensors.end(), next.begin(), next.end()); + } + + TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs, + /*compare_order*/ true)); +} + 
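The InvalidLastComponentShape() and InvalidLastComponentDType() cases above exercise the validation added to FilterByLastComponentDatasetOp earlier in this change: the last output component must be a DT_BOOL scalar before it can drive the keep/drop decision, otherwise GetNext() fails with InvalidArgument. A minimal standalone sketch of that check follows; it is not part of the patch, and the helper name is hypothetical.

#include <iostream>

#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"

namespace {
// Hypothetical helper mirroring the scalar-bool validation: anything other
// than a single-element DT_BOOL tensor is rejected with InvalidArgument.
tensorflow::Status CheckLastComponentIsBoolScalar(const tensorflow::Tensor& last) {
  if (last.NumElements() != 1 || last.dtype() != tensorflow::DT_BOOL) {
    return tensorflow::errors::InvalidArgument(
        "Last component must be a bool scalar.");
  }
  return tensorflow::Status::OK();
}
}  // namespace

int main() {
  tensorflow::Tensor ok(tensorflow::DT_BOOL, tensorflow::TensorShape({}));
  ok.scalar<bool>()() = true;
  tensorflow::Tensor bad(tensorflow::DT_BOOL, tensorflow::TensorShape({2}));
  // The scalar passes; the length-2 vector is rejected, matching the
  // INVALID_ARGUMENT expectation in the InvalidLastComponent test below.
  std::cout << CheckLastComponentIsBoolScalar(ok).ok() << " "
            << CheckLastComponentIsBoolScalar(bad).ok() << std::endl;
  return 0;
}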
+TEST_F(FilterByLastComponentDatasetOpTest, DatasetNodeName) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + const TestCase &test_case = TestCase1(); + + std::unique_ptr<OpKernel> filter_by_last_component_dataset_kernel; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel( + test_case.expected_output_dtypes, test_case.expected_output_shapes, + &filter_by_last_component_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_by_last_component_dataset_context; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext( + filter_by_last_component_dataset_kernel.get(), &inputs, + &filter_by_last_component_dataset_context)); + DatasetBase *filter_by_last_component_dataset; + TF_ASSERT_OK(CreateDataset(filter_by_last_component_dataset_kernel.get(), + filter_by_last_component_dataset_context.get(), + &filter_by_last_component_dataset)); + core::ScopedUnref scoped_unref(filter_by_last_component_dataset); + + EXPECT_EQ(filter_by_last_component_dataset->node_name(), kNodeName); +} + +TEST_F(FilterByLastComponentDatasetOpTest, DatasetTypeString) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + const TestCase &test_case = TestCase1(); + + std::unique_ptr<OpKernel> filter_by_last_component_dataset_kernel; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel( + test_case.expected_output_dtypes, test_case.expected_output_shapes, + &filter_by_last_component_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_by_last_component_dataset_context; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext( + filter_by_last_component_dataset_kernel.get(), &inputs, + &filter_by_last_component_dataset_context)); + DatasetBase *filter_by_last_component_dataset; + TF_ASSERT_OK(CreateDataset(filter_by_last_component_dataset_kernel.get(), + filter_by_last_component_dataset_context.get(), + &filter_by_last_component_dataset)); + core::ScopedUnref scoped_unref(filter_by_last_component_dataset); + + EXPECT_EQ(filter_by_last_component_dataset->type_string(), kOpName); +} + +TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, DatasetOutputDtypes) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + const TestCase &test_case = GetParam(); + + std::unique_ptr<OpKernel> filter_by_last_component_dataset_kernel; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel( + test_case.expected_output_dtypes, test_case.expected_output_shapes, + &filter_by_last_component_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + 
TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_by_last_component_dataset_context; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext( + filter_by_last_component_dataset_kernel.get(), &inputs, + &filter_by_last_component_dataset_context)); + DatasetBase *filter_by_last_component_dataset; + TF_ASSERT_OK(CreateDataset(filter_by_last_component_dataset_kernel.get(), + filter_by_last_component_dataset_context.get(), + &filter_by_last_component_dataset)); + core::ScopedUnref scoped_unref(filter_by_last_component_dataset); + + TF_EXPECT_OK( + VerifyTypesMatch(filter_by_last_component_dataset->output_dtypes(), + test_case.expected_output_dtypes)); +} + +TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, DatasetOutputShapes) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + const TestCase &test_case = GetParam(); + + std::unique_ptr<OpKernel> filter_by_last_component_dataset_kernel; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel( + test_case.expected_output_dtypes, test_case.expected_output_shapes, + &filter_by_last_component_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_by_last_component_dataset_context; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext( + filter_by_last_component_dataset_kernel.get(), &inputs, + &filter_by_last_component_dataset_context)); + DatasetBase *filter_by_last_component_dataset; + TF_ASSERT_OK(CreateDataset(filter_by_last_component_dataset_kernel.get(), + filter_by_last_component_dataset_context.get(), + &filter_by_last_component_dataset)); + core::ScopedUnref scoped_unref(filter_by_last_component_dataset); + + TF_EXPECT_OK( + VerifyShapesCompatible(filter_by_last_component_dataset->output_shapes(), + test_case.expected_output_shapes)); +} + +TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, Cardinality) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + const TestCase &test_case = GetParam(); + + std::unique_ptr<OpKernel> filter_by_last_component_dataset_kernel; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel( + test_case.expected_output_dtypes, test_case.expected_output_shapes, + &filter_by_last_component_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_by_last_component_dataset_context; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext( + filter_by_last_component_dataset_kernel.get(), &inputs, + &filter_by_last_component_dataset_context)); + DatasetBase *filter_by_last_component_dataset; + TF_ASSERT_OK(CreateDataset(filter_by_last_component_dataset_kernel.get(), + 
filter_by_last_component_dataset_context.get(), + &filter_by_last_component_dataset)); + core::ScopedUnref scoped_unref(filter_by_last_component_dataset); + + EXPECT_EQ(filter_by_last_component_dataset->Cardinality(), + test_case.expected_cardinality); +} + +TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, DatasetSave) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + const TestCase &test_case = GetParam(); + + std::unique_ptr<OpKernel> filter_by_last_component_dataset_kernel; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel( + test_case.expected_output_dtypes, test_case.expected_output_shapes, + &filter_by_last_component_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_by_last_component_dataset_context; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext( + filter_by_last_component_dataset_kernel.get(), &inputs, + &filter_by_last_component_dataset_context)); + DatasetBase *filter_by_last_component_dataset; + TF_ASSERT_OK(CreateDataset(filter_by_last_component_dataset_kernel.get(), + filter_by_last_component_dataset_context.get(), + &filter_by_last_component_dataset)); + core::ScopedUnref scoped_unref(filter_by_last_component_dataset); + + std::unique_ptr<SerializationContext> serialization_ctx; + TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx)); + VariantTensorData data; + VariantTensorDataWriter writer(&data); + TF_ASSERT_OK( + filter_by_last_component_dataset->Save(serialization_ctx.get(), &writer)); + TF_ASSERT_OK(writer.Flush()); +} + +TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, IteratorOutputDtypes) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + const TestCase &test_case = GetParam(); + + std::unique_ptr<OpKernel> filter_by_last_component_dataset_kernel; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel( + test_case.expected_output_dtypes, test_case.expected_output_shapes, + &filter_by_last_component_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_by_last_component_dataset_context; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext( + filter_by_last_component_dataset_kernel.get(), &inputs, + &filter_by_last_component_dataset_context)); + DatasetBase *filter_by_last_component_dataset; + TF_ASSERT_OK(CreateDataset(filter_by_last_component_dataset_kernel.get(), + filter_by_last_component_dataset_context.get(), + &filter_by_last_component_dataset)); + core::ScopedUnref scoped_unref(filter_by_last_component_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK(CreateIteratorContext( + filter_by_last_component_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + 
TF_ASSERT_OK(filter_by_last_component_dataset->MakeIterator( + iterator_ctx.get(), "Iterator", &iterator)); + + TF_EXPECT_OK(VerifyTypesMatch(iterator->output_dtypes(), + test_case.expected_output_dtypes)); +} + +TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, IteratorOutputShapes) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + const TestCase &test_case = GetParam(); + + std::unique_ptr<OpKernel> filter_by_last_component_dataset_kernel; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel( + test_case.expected_output_dtypes, test_case.expected_output_shapes, + &filter_by_last_component_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_by_last_component_dataset_context; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext( + filter_by_last_component_dataset_kernel.get(), &inputs, + &filter_by_last_component_dataset_context)); + DatasetBase *filter_by_last_component_dataset; + TF_ASSERT_OK(CreateDataset(filter_by_last_component_dataset_kernel.get(), + filter_by_last_component_dataset_context.get(), + &filter_by_last_component_dataset)); + core::ScopedUnref scoped_unref(filter_by_last_component_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK(CreateIteratorContext( + filter_by_last_component_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK(filter_by_last_component_dataset->MakeIterator( + iterator_ctx.get(), "Iterator", &iterator)); + + TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(), + test_case.expected_output_shapes)); +} + +TEST_F(FilterByLastComponentDatasetOpTest, IteratorOutputPrefix) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + const TestCase &test_case = TestCase1(); + + std::unique_ptr<OpKernel> filter_by_last_component_dataset_kernel; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel( + test_case.expected_output_dtypes, test_case.expected_output_shapes, + &filter_by_last_component_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_by_last_component_dataset_context; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext( + filter_by_last_component_dataset_kernel.get(), &inputs, + &filter_by_last_component_dataset_context)); + DatasetBase *filter_by_last_component_dataset; + TF_ASSERT_OK(CreateDataset(filter_by_last_component_dataset_kernel.get(), + filter_by_last_component_dataset_context.get(), + &filter_by_last_component_dataset)); + core::ScopedUnref scoped_unref(filter_by_last_component_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK(CreateIteratorContext( + filter_by_last_component_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + 
TF_ASSERT_OK(filter_by_last_component_dataset->MakeIterator( + iterator_ctx.get(), "Iterator", &iterator)); + + EXPECT_EQ(iterator->prefix(), "Iterator::FilterByLastComponent"); +} + +TEST_P(ParameterizedFilterByLastComponentDatasetOpTest, Roundtrip) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + const TestCase &test_case = GetParam(); + + std::unique_ptr<OpKernel> filter_by_last_component_dataset_kernel; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel( + test_case.expected_output_dtypes, test_case.expected_output_shapes, + &filter_by_last_component_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_by_last_component_dataset_context; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext( + filter_by_last_component_dataset_kernel.get(), &inputs, + &filter_by_last_component_dataset_context)); + DatasetBase *filter_by_last_component_dataset; + TF_ASSERT_OK(CreateDataset(filter_by_last_component_dataset_kernel.get(), + filter_by_last_component_dataset_context.get(), + &filter_by_last_component_dataset)); + core::ScopedUnref scoped_unref(filter_by_last_component_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK(CreateIteratorContext( + filter_by_last_component_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK(filter_by_last_component_dataset->MakeIterator( + iterator_ctx.get(), "Iterator", &iterator)); + std::unique_ptr<SerializationContext> serialization_ctx; + TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx)); + + bool end_of_sequence = false; + std::vector<Tensor> out_tensors; + int cur_iteration = 0; + const std::vector<int> &breakpoints = test_case.breakpoints; + for (int breakpoint : breakpoints) { + VariantTensorData data; + VariantTensorDataWriter writer(&data); + TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer)); + TF_EXPECT_OK(writer.Flush()); + VariantTensorDataReader reader(&data); + TF_EXPECT_OK(RestoreIterator(iterator_ctx.get(), &reader, "Iterator", + *filter_by_last_component_dataset, &iterator)); + + while (cur_iteration <= breakpoint) { + std::vector<Tensor> next; + TF_EXPECT_OK( + iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence)); + out_tensors.insert(out_tensors.end(), next.begin(), next.end()); + cur_iteration++; + } + } + + TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs, + /*compare_order*/ true)); +} + +INSTANTIATE_TEST_SUITE_P(FilterByLastComponentDatasetOpTest, + ParameterizedFilterByLastComponentDatasetOpTest, + ::testing::ValuesIn(std::vector<TestCase>( + {TestCase1(), TestCase2(), TestCase3()}))); + +TEST_F(FilterByLastComponentDatasetOpTest, InvalidLastComponent) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + std::vector<TestCase> test_cases = {InvalidLastComponentShape(), + InvalidLastComponentDType()}; + for (const TestCase &test_case : test_cases) { + std::unique_ptr<OpKernel> filter_by_last_component_dataset_kernel; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetKernel( + 
test_case.expected_output_dtypes, test_case.expected_output_shapes, + &filter_by_last_component_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = + test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor( + &inputs_for_tensor_slice_dataset, &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_by_last_component_dataset_context; + TF_ASSERT_OK(CreateFilterByLastComponentDatasetContext( + filter_by_last_component_dataset_kernel.get(), &inputs, + &filter_by_last_component_dataset_context)); + DatasetBase *filter_by_last_component_dataset; + TF_ASSERT_OK(CreateDataset(filter_by_last_component_dataset_kernel.get(), + filter_by_last_component_dataset_context.get(), + &filter_by_last_component_dataset)); + core::ScopedUnref scoped_unref(filter_by_last_component_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK(CreateIteratorContext( + filter_by_last_component_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK(filter_by_last_component_dataset->MakeIterator( + iterator_ctx.get(), "Iterator", &iterator)); + + std::vector<Tensor> next; + bool end_of_sequence = false; + EXPECT_EQ( + iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence).code(), + tensorflow::error::INVALID_ARGUMENT); + } +} + +} // namespace +} // namespace data +} // namespace tensorflow diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc index f707ada623d..688d120ba8e 100644 --- a/tensorflow/core/kernels/data/filter_dataset_op.cc +++ b/tensorflow/core/kernels/data/filter_dataset_op.cc @@ -149,6 +149,9 @@ class FilterDatasetOp : public UnaryDatasetOpKernel { if (result.size() != 1 || result[0].dtype() != DT_BOOL || result[0].NumElements() != 1) { + // Clear the output tensor list since there were errors with Filter + // prediction result. + out_tensors->clear(); return errors::InvalidArgument( "Filter predicate `f` must return a scalar bool."); } diff --git a/tensorflow/core/kernels/data/filter_dataset_op_test.cc b/tensorflow/core/kernels/data/filter_dataset_op_test.cc new file mode 100644 index 00000000000..b145600b833 --- /dev/null +++ b/tensorflow/core/kernels/data/filter_dataset_op_test.cc @@ -0,0 +1,593 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/kernels/data/dataset_test_base.h" + +namespace tensorflow { +namespace data { +namespace { + +constexpr char kNodeName[] = "filter_dataset"; +constexpr char kOpName[] = "FilterDataset"; + +class FilterDatasetOpTest : public DatasetOpsTestBase { + protected: + // Creates `TensorSliceDataset` variant tensor from the input vector of + // tensors. 
+ Status CreateTensorSliceDatasetTensor( + std::vector<Tensor> *const tensor_vector, Tensor *dataset_tensor) { + DatasetBase *tensor_slice_dataset; + TF_RETURN_IF_ERROR(CreateTensorSliceDataset( + "tensor_slice_node", tensor_vector, &tensor_slice_dataset)); + TF_RETURN_IF_ERROR( + StoreDatasetInVariantTensor(tensor_slice_dataset, dataset_tensor)); + return Status::OK(); + } + + // Creates a new `FilterDataset` op kernel. + Status CreateFilterDatasetKernel( + const FunctionDefHelper::AttrValueWrapper &func, + const DataTypeVector &output_types, + const std::vector<PartialTensorShape> &output_shapes, + std::unique_ptr<OpKernel> *op_kernel) { + NodeDef node_def = + test::function::NDef(kNodeName, kOpName, {"input_dataset"}, + {{"predicate", func}, + {"Targuments", {}}, + {"output_types", output_types}, + {"output_shapes", output_shapes}}); + TF_RETURN_IF_ERROR(CreateOpKernel(node_def, op_kernel)); + return Status::OK(); + } + + // Creates a new `FilterDataset` op kernel context. + Status CreateFilterDatasetContext( + OpKernel *const op_kernel, + gtl::InlinedVector<TensorValue, 4> *const inputs, + std::unique_ptr<OpKernelContext> *context) { + TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs)); + TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context)); + return Status::OK(); + } +}; + +struct TestCase { + std::vector<Tensor> input_tensors; + FunctionDefHelper::AttrValueWrapper func; + std::vector<FunctionDef> func_lib; + std::vector<Tensor> expected_outputs; + DataTypeVector expected_output_dtypes; + std::vector<PartialTensorShape> expected_output_shapes; + int64 expected_cardinality; + std::vector<int> breakpoints; +}; + +template <typename T> +std::vector<Tensor> ConvertToTensorVec(std::vector<T> values) { + std::vector<Tensor> tensors; + tensors.reserve(values.size()); + for (auto &value : values) { + tensors.emplace_back( + DatasetOpsTestBase::CreateTensor<T>(TensorShape({1}), {value})); + } + return tensors; +} + +// Test case 1: normal case. +TestCase TestCase1() { + return {/*input_tensors*/ + {DatasetOpsTestBase::CreateTensor<int64>( + TensorShape{9, 1}, {0, 0, 0, 3, 4, 5, 6, 7, 8})}, + /*func*/ FunctionDefHelper::FunctionRef("IsZero", {{"T", DT_INT64}}), + /*func_lib*/ {test::function::IsZero()}, + /*expected_outputs*/ + ConvertToTensorVec<int64>({0, 0, 0}), + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({1})}, + /*expected_cardinality*/ kUnknownCardinality, + /*breakpoints*/ {0, 2, 6}}; +} + +// Test case 2: the input dataset has no outputs. +TestCase TestCase2() { + return {/*input_tensors*/ + {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{0}, {})}, + /*func*/ FunctionDefHelper::FunctionRef("IsZero", {{"T", DT_INT64}}), + /*func_lib*/ {test::function::IsZero()}, + /*expected_outputs*/ + ConvertToTensorVec<int64>({}), + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ kUnknownCardinality, + /*breakpoints*/ {0, 2, 6}}; +} + +// Test case 3: the filter function returns two outputs.
+TestCase InvalidFuncTestCase1() { + return {/*input_tensors*/ + {DatasetOpsTestBase::CreateTensor<int64>( + TensorShape{3, 3}, {0, 0, 0, 3, 4, 5, 6, 7, 8})}, + /*func*/ + FunctionDefHelper::FunctionRef( + "GetUnique", {{"T", DT_INT64}, {"out_idx", DT_INT32}}), + /*func_lib*/ {test::function::Unique()}, + /*expected_outputs*/ + ConvertToTensorVec<int64>({}), + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({3, 1})}, + /*expected_cardinality*/ kUnknownCardinality, + /*breakpoints*/ {}}; +} + +// Test case 4: the filter function returns a 1-D bool tensor. +TestCase InvalidFuncTestCase2() { + return {/*input_tensors*/ + {DatasetOpsTestBase::CreateTensor<int64>( + TensorShape{3, 3, 1}, {0, 0, 0, 3, 4, 5, 6, 7, 8})}, + /*func*/ FunctionDefHelper::FunctionRef("IsZero", {{"T", DT_INT64}}), + /*func_lib*/ {test::function::IsZero()}, + /*expected_outputs*/ + ConvertToTensorVec<int64>({}), + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({3, 1})}, + /*expected_cardinality*/ kUnknownCardinality, + /*breakpoints*/ {}}; +} + +// Test case 5: the filter function returns a scalar int64 tensor. +TestCase InvalidFuncTestCase3() { + return {/*input_tensors*/ + {DatasetOpsTestBase::CreateTensor<int64>( + TensorShape{9}, {0, 0, 0, 3, 4, 5, 6, 7, 8})}, + /*func*/ FunctionDefHelper::FunctionRef("NonZero", {{"T", DT_INT64}}), + /*func_lib*/ {test::function::NonZero()}, + /*expected_outputs*/ + ConvertToTensorVec<int64>({}), + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ kUnknownCardinality, + /*breakpoints*/ {}}; +} + +class ParameterizedFilterDatasetOpTest + : public FilterDatasetOpTest, + public ::testing::WithParamInterface<TestCase> {}; + +TEST_P(ParameterizedFilterDatasetOpTest, GetNext) { + int thread_num = 2, cpu_num = 2; + const TestCase &test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num)); + + std::unique_ptr<OpKernel> filter_dataset_kernel; + TF_ASSERT_OK(CreateFilterDatasetKernel( + test_case.func, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &filter_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_dataset_context; + TF_ASSERT_OK(CreateFilterDatasetContext(filter_dataset_kernel.get(), &inputs, + &filter_dataset_context)); + DatasetBase *filter_dataset; + TF_ASSERT_OK(CreateDataset(filter_dataset_kernel.get(), + filter_dataset_context.get(), &filter_dataset)); + core::ScopedUnref scoped_unref(filter_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK( + CreateIteratorContext(filter_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK( + filter_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator)); + bool end_of_sequence = false; + std::vector<Tensor> out_tensors; + while (!end_of_sequence) { + std::vector<Tensor> next; + TF_EXPECT_OK( + iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence)); + out_tensors.insert(out_tensors.end(), next.begin(), next.end()); + } + + TF_EXPECT_OK(ExpectEqual(out_tensors, 
test_case.expected_outputs, + /*compare_order*/ true)); +} + +TEST_F(FilterDatasetOpTest, DatasetNodeName) { + int thread_num = 2, cpu_num = 2; + const TestCase &test_case = TestCase1(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num)); + + std::unique_ptr<OpKernel> filter_dataset_kernel; + TF_ASSERT_OK(CreateFilterDatasetKernel( + test_case.func, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &filter_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_dataset_context; + TF_ASSERT_OK(CreateFilterDatasetContext(filter_dataset_kernel.get(), &inputs, + &filter_dataset_context)); + DatasetBase *filter_dataset; + TF_ASSERT_OK(CreateDataset(filter_dataset_kernel.get(), + filter_dataset_context.get(), &filter_dataset)); + core::ScopedUnref scoped_unref(filter_dataset); + + EXPECT_EQ(filter_dataset->node_name(), kNodeName); +} + +TEST_F(FilterDatasetOpTest, DatasetTypeString) { + int thread_num = 2, cpu_num = 2; + const TestCase &test_case = TestCase1(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num)); + + std::unique_ptr<OpKernel> filter_dataset_kernel; + TF_ASSERT_OK(CreateFilterDatasetKernel( + test_case.func, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &filter_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_dataset_context; + TF_ASSERT_OK(CreateFilterDatasetContext(filter_dataset_kernel.get(), &inputs, + &filter_dataset_context)); + DatasetBase *filter_dataset; + TF_ASSERT_OK(CreateDataset(filter_dataset_kernel.get(), + filter_dataset_context.get(), &filter_dataset)); + core::ScopedUnref scoped_unref(filter_dataset); + + EXPECT_EQ(filter_dataset->type_string(), kOpName); +} + +TEST_P(ParameterizedFilterDatasetOpTest, DatasetOutputDtypes) { + int thread_num = 2, cpu_num = 2; + const TestCase &test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num)); + + std::unique_ptr<OpKernel> filter_dataset_kernel; + TF_ASSERT_OK(CreateFilterDatasetKernel( + test_case.func, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &filter_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_dataset_context; + TF_ASSERT_OK(CreateFilterDatasetContext(filter_dataset_kernel.get(), &inputs, + &filter_dataset_context)); + DatasetBase *filter_dataset; + TF_ASSERT_OK(CreateDataset(filter_dataset_kernel.get(), + 
filter_dataset_context.get(), &filter_dataset)); + core::ScopedUnref scoped_unref(filter_dataset); + + TF_EXPECT_OK(VerifyTypesMatch(filter_dataset->output_dtypes(), + test_case.expected_output_dtypes)); +} + +TEST_P(ParameterizedFilterDatasetOpTest, DatasetOutputShapes) { + int thread_num = 2, cpu_num = 2; + const TestCase &test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num)); + + std::unique_ptr<OpKernel> filter_dataset_kernel; + TF_ASSERT_OK(CreateFilterDatasetKernel( + test_case.func, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &filter_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_dataset_context; + TF_ASSERT_OK(CreateFilterDatasetContext(filter_dataset_kernel.get(), &inputs, + &filter_dataset_context)); + DatasetBase *filter_dataset; + TF_ASSERT_OK(CreateDataset(filter_dataset_kernel.get(), + filter_dataset_context.get(), &filter_dataset)); + core::ScopedUnref scoped_unref(filter_dataset); + + TF_EXPECT_OK(VerifyShapesCompatible(filter_dataset->output_shapes(), + test_case.expected_output_shapes)); +} + +TEST_P(ParameterizedFilterDatasetOpTest, Cardinality) { + int thread_num = 2, cpu_num = 2; + const TestCase &test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num)); + + std::unique_ptr<OpKernel> filter_dataset_kernel; + TF_ASSERT_OK(CreateFilterDatasetKernel( + test_case.func, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &filter_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_dataset_context; + TF_ASSERT_OK(CreateFilterDatasetContext(filter_dataset_kernel.get(), &inputs, + &filter_dataset_context)); + DatasetBase *filter_dataset; + TF_ASSERT_OK(CreateDataset(filter_dataset_kernel.get(), + filter_dataset_context.get(), &filter_dataset)); + core::ScopedUnref scoped_unref(filter_dataset); + + EXPECT_EQ(filter_dataset->Cardinality(), test_case.expected_cardinality); +} + +TEST_P(ParameterizedFilterDatasetOpTest, DatasetSave) { + int thread_num = 2, cpu_num = 2; + const TestCase &test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num)); + + std::unique_ptr<OpKernel> filter_dataset_kernel; + TF_ASSERT_OK(CreateFilterDatasetKernel( + test_case.func, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &filter_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + 
std::unique_ptr<OpKernelContext> filter_dataset_context; + TF_ASSERT_OK(CreateFilterDatasetContext(filter_dataset_kernel.get(), &inputs, + &filter_dataset_context)); + DatasetBase *filter_dataset; + TF_ASSERT_OK(CreateDataset(filter_dataset_kernel.get(), + filter_dataset_context.get(), &filter_dataset)); + core::ScopedUnref scoped_unref(filter_dataset); + + std::unique_ptr<SerializationContext> serialization_ctx; + TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx)); + VariantTensorData data; + VariantTensorDataWriter writer(&data); + TF_ASSERT_OK(filter_dataset->Save(serialization_ctx.get(), &writer)); + TF_ASSERT_OK(writer.Flush()); +} + +TEST_P(ParameterizedFilterDatasetOpTest, IteratorOutputDtypes) { + int thread_num = 2, cpu_num = 2; + const TestCase &test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num)); + + std::unique_ptr<OpKernel> filter_dataset_kernel; + TF_ASSERT_OK(CreateFilterDatasetKernel( + test_case.func, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &filter_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_dataset_context; + TF_ASSERT_OK(CreateFilterDatasetContext(filter_dataset_kernel.get(), &inputs, + &filter_dataset_context)); + DatasetBase *filter_dataset; + TF_ASSERT_OK(CreateDataset(filter_dataset_kernel.get(), + filter_dataset_context.get(), &filter_dataset)); + core::ScopedUnref scoped_unref(filter_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK( + CreateIteratorContext(filter_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK( + filter_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator)); + + TF_EXPECT_OK(VerifyTypesMatch(iterator->output_dtypes(), + test_case.expected_output_dtypes)); +} + +TEST_P(ParameterizedFilterDatasetOpTest, IteratorOutputShapes) { + int thread_num = 2, cpu_num = 2; + const TestCase &test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num)); + + std::unique_ptr<OpKernel> filter_dataset_kernel; + TF_ASSERT_OK(CreateFilterDatasetKernel( + test_case.func, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &filter_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_dataset_context; + TF_ASSERT_OK(CreateFilterDatasetContext(filter_dataset_kernel.get(), &inputs, + &filter_dataset_context)); + DatasetBase *filter_dataset; + TF_ASSERT_OK(CreateDataset(filter_dataset_kernel.get(), + filter_dataset_context.get(), &filter_dataset)); + core::ScopedUnref scoped_unref(filter_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK( + CreateIteratorContext(filter_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + 
TF_ASSERT_OK( + filter_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator)); + + TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(), + test_case.expected_output_shapes)); +} + +TEST_F(ParameterizedFilterDatasetOpTest, IteratorOutputPrefix) { + int thread_num = 2, cpu_num = 2; + const TestCase &test_case = TestCase1(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num)); + + std::unique_ptr<OpKernel> filter_dataset_kernel; + TF_ASSERT_OK(CreateFilterDatasetKernel( + test_case.func, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &filter_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_dataset_context; + TF_ASSERT_OK(CreateFilterDatasetContext(filter_dataset_kernel.get(), &inputs, + &filter_dataset_context)); + DatasetBase *filter_dataset; + TF_ASSERT_OK(CreateDataset(filter_dataset_kernel.get(), + filter_dataset_context.get(), &filter_dataset)); + core::ScopedUnref scoped_unref(filter_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK( + CreateIteratorContext(filter_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK( + filter_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator)); + + EXPECT_EQ(iterator->prefix(), "Iterator::Filter"); +} + +TEST_P(ParameterizedFilterDatasetOpTest, Roundtrip) { + int thread_num = 2, cpu_num = 2; + const TestCase &test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime(test_case.func_lib, cpu_num)); + + std::unique_ptr<OpKernel> filter_dataset_kernel; + TF_ASSERT_OK(CreateFilterDatasetKernel( + test_case.func, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &filter_dataset_kernel)); + + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset, + &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_dataset_context; + TF_ASSERT_OK(CreateFilterDatasetContext(filter_dataset_kernel.get(), &inputs, + &filter_dataset_context)); + DatasetBase *filter_dataset; + TF_ASSERT_OK(CreateDataset(filter_dataset_kernel.get(), + filter_dataset_context.get(), &filter_dataset)); + core::ScopedUnref scoped_unref(filter_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK( + CreateIteratorContext(filter_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK( + filter_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator)); + + std::unique_ptr<SerializationContext> serialization_ctx; + TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx)); + + bool end_of_sequence = false; + std::vector<Tensor> out_tensors; + int cur_iteration = 0; + const std::vector<int> &breakpoints = test_case.breakpoints; + for (int breakpoint : breakpoints) { + VariantTensorData data; + VariantTensorDataWriter writer(&data); + 
TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer)); + TF_EXPECT_OK(writer.Flush()); + VariantTensorDataReader reader(&data); + TF_EXPECT_OK(RestoreIterator(iterator_ctx.get(), &reader, "Iterator", + *filter_dataset, &iterator)); + + while (cur_iteration <= breakpoint) { + std::vector<Tensor> next; + TF_EXPECT_OK( + iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence)); + out_tensors.insert(out_tensors.end(), next.begin(), next.end()); + cur_iteration++; + } + } + + TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs, + /*compare_order*/ true)); +} + +INSTANTIATE_TEST_SUITE_P( + FilterDatasetOpTest, ParameterizedFilterDatasetOpTest, + ::testing::ValuesIn(std::vector<TestCase>({TestCase1(), TestCase2()}))); + +TEST_F(ParameterizedFilterDatasetOpTest, InvalidFuncs) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime( + {test::function::IsZero(), test::function::Unique(), + test::function::NonZero()}, + cpu_num)); + + std::vector<TestCase> test_cases( + {InvalidFuncTestCase1(), InvalidFuncTestCase2(), InvalidFuncTestCase3()}); + for (const auto &test_case : test_cases) { + std::unique_ptr<OpKernel> filter_dataset_kernel; + TF_ASSERT_OK(CreateFilterDatasetKernel( + test_case.func, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &filter_dataset_kernel)); + Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({})); + std::vector<Tensor> inputs_for_tensor_slice_dataset = + test_case.input_tensors; + TF_ASSERT_OK(CreateTensorSliceDatasetTensor( + &inputs_for_tensor_slice_dataset, &tensor_slice_dataset_tensor)); + gtl::InlinedVector<TensorValue, 4> inputs({&tensor_slice_dataset_tensor}); + std::unique_ptr<OpKernelContext> filter_dataset_context; + TF_ASSERT_OK(CreateFilterDatasetContext(filter_dataset_kernel.get(), + &inputs, &filter_dataset_context)); + DatasetBase *filter_dataset; + TF_ASSERT_OK(CreateDataset(filter_dataset_kernel.get(), + filter_dataset_context.get(), &filter_dataset)); + core::ScopedUnref scoped_unref(filter_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK( + CreateIteratorContext(filter_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK(filter_dataset->MakeIterator(iterator_ctx.get(), "Iterator", + &iterator)); + + bool end_of_sequence = false; + std::vector<Tensor> out_tensors; + EXPECT_EQ( + iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence) + .code(), + tensorflow::error::INVALID_ARGUMENT); + EXPECT_TRUE(out_tensors.empty()); + } +} + +} // namespace +} // namespace data +} // namespace tensorflow diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index f928af8421d..8a37dacd6ca 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -237,6 +237,8 @@ class IteratorResource : public ResourceBase { // destroyed, essentially triggering the iterator deletion. 
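The Deleter below keeps the iterator resource alive through a Helper shared by every copy, so whichever copy is destroyed last performs the actual deletion. A minimal standalone sketch of that pattern, including the default constructor and defaulted move assignment this change adds (names are illustrative, not the TensorFlow types):

#include <functional>
#include <iostream>
#include <memory>
#include <utility>

// Copies share one Helper; the last copy to go away runs the cleanup once.
class ScopedCleanup {
 public:
  ScopedCleanup() = default;  // empty deleter, analogous to Deleter() : deleter_() {}
  explicit ScopedCleanup(std::function<void()> cleanup)
      : helper_(std::make_shared<Helper>(std::move(cleanup))) {}
  ScopedCleanup(const ScopedCleanup&) = default;
  ScopedCleanup(ScopedCleanup&&) = default;
  ScopedCleanup& operator=(const ScopedCleanup&) = delete;
  ScopedCleanup& operator=(ScopedCleanup&&) = default;

 private:
  struct Helper {
    explicit Helper(std::function<void()> c) : cleanup(std::move(c)) {}
    ~Helper() {
      if (cleanup) cleanup();
    }
    std::function<void()> cleanup;
  };
  std::shared_ptr<Helper> helper_;
};

int main() {
  ScopedCleanup pending;  // default-constructed, owns nothing yet
  {
    ScopedCleanup active([] { std::cout << "cleanup runs exactly once\n"; });
    pending = std::move(active);  // hand off ownership without cleaning up
  }
  return 0;
}  // `pending` destroyed here: the shared Helper finally runs the cleanup

Copying still extends the resource's lifetime; reseating an existing deleter is only possible by moving into it, which is what the deleted copy assignment and defaulted move assignment below spell out.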
class Deleter { public: + Deleter() : deleter_() {} + Deleter(ResourceHandle handle, ResourceMgr* resource_manager) : deleter_(std::make_shared<Helper>(handle, resource_manager)) {} @@ -248,6 +250,10 @@ class IteratorResource : public ResourceBase { VLOG(3) << "IteratorResource::Deleter copy constructor called."; } + Deleter& operator=(const Deleter& rhs) = delete; + + Deleter& operator=(Deleter&& rhs) = default; + virtual ~Deleter() { VLOG(3) << "IteratorResource::Deleter destructor called."; } @@ -358,6 +364,9 @@ class IteratorStateVariant { Decode(*other.data_); } } + IteratorStateVariant& operator=(IteratorStateVariant&& other) = default; + IteratorStateVariant& operator=(const IteratorStateVariant& other) = delete; + // Initializes this object with the current state of the iterator so // that it can be written on the next call to Encode(). Status InitializeFromIterator(OpKernelContext* ctx, @@ -916,12 +925,6 @@ class OneShotIteratorOp : public AsyncOpKernel { &f_handle)); FunctionLibraryRuntime::Options opts; opts.cancellation_manager = ctx->cancellation_manager(); - // Choose a step ID that is guaranteed not to clash with any - // Session-generated step ID. DirectSession only generates - // non-negative step IDs (contiguous, starting from 0), and - // MasterSession generates 56-bit random step IDs whose MSB is - // always 0, so a negative random step ID should suffice. - opts.step_id = -std::abs(static_cast<int64>(random::New64())); ScopedStepContainer step_container(opts.step_id, [ctx](const string& name) { ctx->resource_manager()->Cleanup(name).IgnoreError(); }); @@ -1221,8 +1224,9 @@ REGISTER_KERNEL_BUILDER( MakeIteratorOp); REGISTER_KERNEL_BUILDER(Name("DeleteIterator").Device(DEVICE_CPU).Priority(2), DeleteIteratorOp); -REGISTER_KERNEL_BUILDER(Name("DeleteIterator").Device(DEVICE_GPU).Priority(1), - DeleteIteratorOp); +REGISTER_KERNEL_BUILDER( + Name("DeleteIterator").Device(DEVICE_GPU).HostMemory("deleter").Priority(1), + DeleteIteratorOp); REGISTER_KERNEL_BUILDER( Name("AnonymousIterator").Device(DEVICE_CPU).Priority(2), AnonymousIteratorHandleOp); diff --git a/tensorflow/core/kernels/data/map_defun_op.cc b/tensorflow/core/kernels/data/map_defun_op.cc index 1577e770d31..cae0facfba3 100644 --- a/tensorflow/core/kernels/data/map_defun_op.cc +++ b/tensorflow/core/kernels/data/map_defun_op.cc @@ -250,7 +250,6 @@ class MapDefunOp : public AsyncOpKernel { void SetRunOptions(OpKernelContext* ctx, FunctionLibraryRuntime::Options* opts, ComputeOptions* compute_opts, bool always_collect_stats) { - opts->step_id = ctx->step_id(); opts->rendezvous = ctx->rendezvous(); if (always_collect_stats) { opts->stats_collector = ctx->stats_collector(); diff --git a/tensorflow/core/kernels/data/optional_ops.h b/tensorflow/core/kernels/data/optional_ops.h index 24eb1b81d90..91fa253b70d 100644 --- a/tensorflow/core/kernels/data/optional_ops.h +++ b/tensorflow/core/kernels/data/optional_ops.h @@ -90,10 +90,10 @@ class OptionalVariant { string DebugString() const { if (values_) { return strings::StrCat("OptionalVariant<", "values: (", - str_util::Join(*values_, ", ", - [](string* s, const Tensor& elem) { - *s = elem.DebugString(); - }), + absl::StrJoin(*values_, ", ", + [](string* s, const Tensor& elem) { + *s = elem.DebugString(); + }), ")>"); } else { return strings::StrCat("OptionalVariant<None>"); diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc index 666eac3814c..835b2387c1e 100644 --- 
a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc +++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc @@ -30,6 +30,8 @@ limitations under the License. #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/cpu_info.h" namespace tensorflow { @@ -75,7 +77,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { OP_REQUIRES(ctx, block_length > 0, errors::InvalidArgument("`block_length` must be > 0")); - int64 num_parallel_calls; + int64 num_parallel_calls = 0; OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls", &num_parallel_calls)); OP_REQUIRES( @@ -210,6 +212,15 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel { } } + string BuildTraceMeName() override { + int64 parallelism; + { + tf_shared_lock l(*mu_); + parallelism = num_parallel_calls_->value; + } + return strings::StrCat(prefix(), "#parallelism=", parallelism, "#"); + } + Status Initialize(IteratorContext* ctx) override { mutex_lock l(*mu_); if (num_parallel_calls_->value == model::kAutoTune) { diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc index 9194c60f6c2..52befecb12e 100644 --- a/tensorflow/core/kernels/data/parallel_map_iterator.cc +++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/framework/stats_aggregator.h" #include "tensorflow/core/kernels/data/stats_utils.h" #include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/cpu_info.h" namespace tensorflow { @@ -72,6 +73,15 @@ class ParallelMapIterator : public DatasetBaseIterator { } } + string BuildTraceMeName() override { + int64 parallelism; + { + tf_shared_lock l(*mu_); + parallelism = num_parallel_calls_->value; + } + return strings::StrCat(prefix(), "#parallelism=", parallelism, "#"); + } + Status Initialize(IteratorContext* ctx) override { mutex_lock l(*mu_); if (num_parallel_calls_->value == model::kAutoTune) { diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc index eeb6d3005bd..e356044492c 100644 --- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc +++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/lib/core/error_codes.pb.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/stringprintf.h" namespace tensorflow { namespace data { @@ -31,14 +32,18 @@ namespace data { // See documentation in ../../ops/dataset_ops.cc for a high-level // description of the following op. +// Determines the fraction of slack time by which to delay prefetching of data. 
+constexpr double kSleepFactor = 0.2; constexpr char kDatasetName[] = "Prefetch"; class PrefetchDatasetOp::Dataset : public DatasetBase { public: - Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 buffer_size) + Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 buffer_size, + int64 slack_period) : DatasetBase(DatasetContext(ctx)), input_(input), - buffer_size_(buffer_size) { + buffer_size_(buffer_size), + slack_period_(slack_period) { input_->Ref(); } @@ -70,8 +75,11 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); Node* buffer_size = nullptr; TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size)); - TF_RETURN_IF_ERROR( - b->AddDataset(this, {input_graph_node, buffer_size}, output)); + AttrValue slack_period_attr; + b->BuildAttrValue(slack_period_, &slack_period_attr); + TF_RETURN_IF_ERROR(b->AddDataset( + this, {input_graph_node, buffer_size}, + {std::make_pair("slack_period", slack_period_attr)}, output)); return Status::OK(); } @@ -80,7 +88,9 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { public: explicit Iterator(const Params& params) : DatasetIterator<Dataset>(params), - auto_tuner_(params.dataset->buffer_size_) {} + auto_tuner_(params.dataset->buffer_size_) { + slack_us_ = 0; + } ~Iterator() override { // Signal the prefetch thread to terminate it. We will then @@ -98,6 +108,15 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { } } + string BuildTraceMeName() override { + int64 buffer_limit; + { + tf_shared_lock l(mu_); + buffer_limit = auto_tuner_.buffer_limit(); + } + return strings::StrCat(prefix(), "#buffer_limit=", buffer_limit, "#"); + } + Status Initialize(IteratorContext* ctx) override { return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_); } @@ -226,6 +245,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { Status status; // The buffered data element. std::vector<Tensor> value; + int64 created_us; }; Status Consume(IteratorContext* ctx, std::vector<Tensor>* out_tensors, @@ -248,6 +268,20 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { // (if we successfully got an element) the output values. Status s = buffer_.front().status; if (s.ok()) { + if (dataset()->slack_period_ > 0 && + (num_elements() + 1) % dataset()->slack_period_ == 0) { + // TODO(rachelim): Consider doing something more sophisticated + // to decide how long to sleep for; e.g. using a kalman filter. + int64 slack_us = + Env::Default()->NowMicros() - buffer_.front().created_us; + // Every slack_period_-th element, update the most recent slack time, + // measured by the duration between when the element is prefetched + // and when it is consumed. We add kSleepFactor * slack_us_ to the + // measurement because we slept for that duration before prefetching + // the element. + slack_us_ = kSleepFactor * slack_us_ + slack_us; + VLOG(2) << "Setting slack_us_: " << slack_us_; + } *out_tensors = std::move(buffer_.front().value); RecordBufferDequeue(ctx, *out_tensors); } @@ -282,6 +316,8 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { void PrefetchThread(const std::shared_ptr<IteratorContext>& ctx) { RecordStart(ctx.get()); auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); }); + // Keep track of where we are in an iteration "burst" + int num_produced = 0; while (true) { // 1. Wait for a slot in the buffer. 
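The slack bookkeeping added in this file forms a small feedback loop: on every slack_period-th element the consumer measures how long the element sat in the buffer, adds back the sleep the producer already injected, and the prefetch thread then sleeps kSleepFactor times that estimate before starting the next burst (next hunk). A standalone sketch of the arithmetic with made-up timings, not the op's code:

#include <cstdint>
#include <iostream>

int main() {
  const double kSleepFactor = 0.2;  // fraction of the slack estimate to sleep
  const int64_t slack_period = 8;   // measure and sleep on every 8th element
  double slack_us = 0.0;            // running slack estimate

  for (int64_t element = 1; element <= 32; ++element) {
    if (element % slack_period != 0) continue;
    // Pretend each measured element waited 1000us in the buffer. The producer
    // already slept kSleepFactor * slack_us before prefetching it, so that
    // amount is added back to recover the raw producer/consumer slack.
    const double measured_us = 1000.0;
    slack_us = kSleepFactor * slack_us + measured_us;
    std::cout << "after element " << element << ": next burst sleeps "
              << kSleepFactor * slack_us << "us\n";
  }
  return 0;
}

With a steady 1000us of measured slack the estimate converges to 1250us and the injected sleep to about 250us; folding the previous sleep back into the estimate is what keeps it tracking the true slack even though the measured gap shrinks once sleeping starts.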
{ @@ -297,6 +333,14 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { } } + if (dataset()->slack_period_ > 0 && + num_produced % dataset()->slack_period_ == 0) { + // For the first element in the "burst", sleep for a bit if there is + // slack. + VLOG(2) << "Sleeping for: " << slack_us_ * kSleepFactor; + ctx->env()->SleepForMicroseconds(slack_us_ * kSleepFactor); + } + // 2. Read the next element. // Acquire the parent lock since we will be reading an element // from the input iterator. Note that we do not wish to release @@ -319,9 +363,11 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { { mutex_lock l(mu_); RecordBufferEnqueue(ctx.get(), buffer_element.value); + buffer_element.created_us = ctx->env()->NowMicros(); buffer_.push_back(std::move(buffer_element)); cond_var_.notify_all(); } + ++num_produced; } } @@ -375,14 +421,20 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_); bool cancelled_ GUARDED_BY(mu_) = false; bool prefetch_thread_finished_ GUARDED_BY(mu_) = false; + + std::atomic<int64> slack_us_; }; const DatasetBase* const input_; const int64 buffer_size_; + + // If non-zero, determines the period between injecting "slack" into the + // execution. + const int64 slack_period_; }; void PrefetchDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) { - int64 buffer_size; + int64 buffer_size = 0; OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size)); OP_REQUIRES(ctx, @@ -396,7 +448,7 @@ void PrefetchDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase* input, metrics::RecordTFDataAutotune(kDatasetName); } - *output = new Dataset(ctx, input, buffer_size); + *output = new Dataset(ctx, input, buffer_size, slack_period_); } namespace { diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.h b/tensorflow/core/kernels/data/prefetch_dataset_op.h index 83206374946..d42e14373bd 100644 --- a/tensorflow/core/kernels/data/prefetch_dataset_op.h +++ b/tensorflow/core/kernels/data/prefetch_dataset_op.h @@ -25,7 +25,11 @@ namespace data { class PrefetchDatasetOp : public UnaryDatasetOpKernel { public: explicit PrefetchDatasetOp(OpKernelConstruction* ctx) - : UnaryDatasetOpKernel(ctx) {} + : UnaryDatasetOpKernel(ctx) { + if (ctx->HasAttr("slack_period")) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("slack_period", &slack_period_)); + } + } protected: void MakeDataset(OpKernelContext* ctx, DatasetBase* input, @@ -33,6 +37,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel { private: class Dataset; + int64 slack_period_ = 0; }; } // namespace data diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op_test.cc b/tensorflow/core/kernels/data/prefetch_dataset_op_test.cc index 4beb306d7fe..56dfbc510e8 100644 --- a/tensorflow/core/kernels/data/prefetch_dataset_op_test.cc +++ b/tensorflow/core/kernels/data/prefetch_dataset_op_test.cc @@ -38,9 +38,11 @@ class PrefetchDatasetOpTest : public DatasetOpsTestBase { const DataTypeVector &output_types, const std::vector<PartialTensorShape> &output_shapes, std::unique_ptr<OpKernel> *op_kernel) { - NodeDef node_def = test::function::NDef( - kNodeName, kOpName, {"input_dataset", "buffer_size"}, - {{"output_types", output_types}, {"output_shapes", output_shapes}}); + NodeDef node_def = test::function::NDef(kNodeName, kOpName, + {"input_dataset", "buffer_size"}, + {{"output_types", output_types}, + {"output_shapes", output_shapes}, + {"slack_period", 0}}); TF_RETURN_IF_ERROR(CreateOpKernel(node_def, 
op_kernel)); return Status::OK(); } diff --git a/tensorflow/core/kernels/data/shard_dataset_op_test.cc b/tensorflow/core/kernels/data/shard_dataset_op_test.cc new file mode 100644 index 00000000000..6da1ff3b570 --- /dev/null +++ b/tensorflow/core/kernels/data/shard_dataset_op_test.cc @@ -0,0 +1,821 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/kernels/data/dataset_test_base.h" + +namespace tensorflow { +namespace data { +namespace { + +constexpr char kNodeName[] = "shard_dataset"; +constexpr char kOpName[] = "ShardDataset"; + +class ShardDatasetOpTest : public DatasetOpsTestBase { + protected: + // Creates a new `ShardDataset` op kernel. + Status CreateShardDatasetOpKernel( + bool require_non_empty, const DataTypeVector& output_types, + const std::vector<PartialTensorShape>& output_shapes, + std::unique_ptr<OpKernel>* op_kernel) { + NodeDef node_def = test::function::NDef( + kNodeName, kOpName, {"input_dataset", "num_shards", "index"}, + {{"require_non_empty", require_non_empty}, + {"output_types", output_types}, + {"output_shapes", output_shapes}}); + TF_RETURN_IF_ERROR(CreateOpKernel(node_def, op_kernel)); + return Status::OK(); + } + + // Create a new `ShardDataset` op kernel context + Status CreateShardDatasetContext( + OpKernel* const op_kernel, + gtl::InlinedVector<TensorValue, 4>* const inputs, + std::unique_ptr<OpKernelContext>* context) { + TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs)); + TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context)); + return Status::OK(); + } +}; + +struct RangeDatasetParam { + int64 start; + int64 end; + int64 step; +}; + +struct TestCase { + RangeDatasetParam range_dataset_param; + Tensor num_shards; + Tensor index; + bool require_non_empty; + std::vector<Tensor> expected_outputs; + DataTypeVector expected_output_dtypes; + std::vector<PartialTensorShape> expected_output_shapes; + int64 expected_cardinality; + std::vector<int> breakpoints; +}; + +// Test Case 1: simple case. +TestCase TestCase1() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_shards*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}), + /*index*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}), + /*require_non_empty*/ true, + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}), + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {7})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 2, + /*breakpoints*/ {0, 1, 5}}; +} + +// Test Case 2: zero offset. 
+TestCase TestCase2() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_shards*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}), + /*index*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}), + /*require_non_empty*/ true, + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}), + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 2, + /*breakpoints*/ {0, 1, 5}}; +} + +// Test Case 3: iterator ends before first element. +TestCase TestCase3() { + return {/*range_data_param*/ {0, 1, 1}, + /*num_shards*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}), + /*index*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}), + /*require_non_empty*/ true, + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 0, + /*breakpoints*/ {0, 1}}; +} + +// Test Case 4: larger num_shards. +TestCase TestCase4() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_shards*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {7}), + /*index*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}), + /*require_non_empty*/ true, + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 1, + /*breakpoints*/ {0, 5}}; +} + +// Test Case 5: index == num_shards. +TestCase TestCase5() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_shards*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}), + /*index*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}), + /*require_non_empty*/ true, + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}), + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {9})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 2, + /*breakpoints*/ {0, 1, 5}}; +} + +// Test Case 6: similar with test_case_5 but the number of outputs could not be +// divided evenly by num_shards. +TestCase TestCase6() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_shards*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {4}), + /*index*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {3}), + /*require_non_empty*/ true, + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {3}), + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {7})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 2, + /*breakpoints*/ {0, 1, 5}}; +} + +// Test Case 7: num_shard is larger than the cardinality of input dataset; +// require_non_empty = false. 
+TestCase TestCase7() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_shards*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {20}), + /*index*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}), + /*require_non_empty*/ false, + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 1, + /*breakpoints*/ {0, 5}}; +} + +// Test Case 8: similar with test_case_7 but require_non_empty = true. +TestCase NoElemForEachShardTestCase() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_shards*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {20}), + /*index*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}), + /*require_non_empty*/ true, + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 1, + /*breakpoints*/ {0, 5}}; +} + +TestCase IndexGreaterNumShardsCase() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_shards*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}), + /*index*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {7}), + /*require_non_empty*/ true, + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 0, + /*breakpoints*/ {}}; +} + +TestCase NegativeIndexTestCase() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_shards*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {5}), + /*index*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {-3}), + /*require_non_empty*/ true, + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 0, + /*breakpoints*/ {}}; +} + +TestCase NegativeNumShardsTestCase() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_shards*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {-3}), + /*index*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}), + /*require_non_empty*/ true, + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 0, + /*breakpoints*/ {}}; +} + +TestCase ZeroNumShardsTestCase() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_shards*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {0}), + /*index*/ + DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}), + /*require_non_empty*/ true, + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 0, + /*breakpoints*/ {}}; +} + +class ParameterizedShardDatasetOpTest + : public ShardDatasetOpTest, + public ::testing::WithParamInterface<TestCase> {}; + +TEST_P(ParameterizedShardDatasetOpTest, GetNext) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::unique_ptr<OpKernel> shard_dataset_kernel; + TF_ASSERT_OK(CreateShardDatasetOpKernel( + test_case.require_non_empty, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &shard_dataset_kernel)); + + DatasetBase* range_dataset; + TF_ASSERT_OK(CreateRangeDataset<int64>( + 
test_case.range_dataset_param.start, test_case.range_dataset_param.end, + test_case.range_dataset_param.step, "range", &range_dataset)); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK( + StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor)); + + Tensor num_shards = test_case.num_shards; + Tensor index = test_case.index; + gtl::InlinedVector<TensorValue, 4> inputs( + {&range_dataset_tensor, &num_shards, &index}); + std::unique_ptr<OpKernelContext> shard_dataset_context; + TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs, + &shard_dataset_context)); + + DatasetBase* shard_dataset; + TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(), + shard_dataset_context.get(), &shard_dataset)); + core::ScopedUnref scoped_unref_batch_dataset(shard_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK( + CreateIteratorContext(shard_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK( + shard_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator)); + + bool end_of_sequence = false; + auto expected_outputs_it = test_case.expected_outputs.begin(); + std::vector<Tensor> out_tensors; + while (!end_of_sequence) { + TF_EXPECT_OK( + iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence)); + if (!end_of_sequence) { + EXPECT_LT(expected_outputs_it, test_case.expected_outputs.end()); + TF_EXPECT_OK(ExpectEqual(out_tensors.back(), *expected_outputs_it)); + expected_outputs_it++; + } + } + EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end()); +} + +TEST_F(ShardDatasetOpTest, DatasetNodeName) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = TestCase1(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::unique_ptr<OpKernel> shard_dataset_kernel; + TF_ASSERT_OK(CreateShardDatasetOpKernel( + test_case.require_non_empty, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &shard_dataset_kernel)); + + DatasetBase* range_dataset; + TF_ASSERT_OK(CreateRangeDataset<int64>( + test_case.range_dataset_param.start, test_case.range_dataset_param.end, + test_case.range_dataset_param.step, "range", &range_dataset)); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK( + StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor)); + + Tensor num_shards = test_case.num_shards; + Tensor index = test_case.index; + gtl::InlinedVector<TensorValue, 4> inputs( + {&range_dataset_tensor, &num_shards, &index}); + std::unique_ptr<OpKernelContext> shard_dataset_context; + TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs, + &shard_dataset_context)); + + DatasetBase* shard_dataset; + TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(), + shard_dataset_context.get(), &shard_dataset)); + core::ScopedUnref scoped_unref_batch_dataset(shard_dataset); + + EXPECT_EQ(shard_dataset->node_name(), kNodeName); +} + +TEST_F(ShardDatasetOpTest, DatasetTypeString) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = TestCase1(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::unique_ptr<OpKernel> shard_dataset_kernel; + TF_ASSERT_OK(CreateShardDatasetOpKernel( + test_case.require_non_empty, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &shard_dataset_kernel)); + + DatasetBase* range_dataset; + TF_ASSERT_OK(CreateRangeDataset<int64>( + 
test_case.range_dataset_param.start, test_case.range_dataset_param.end, + test_case.range_dataset_param.step, "range", &range_dataset)); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK( + StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor)); + + Tensor num_shards = test_case.num_shards; + Tensor index = test_case.index; + gtl::InlinedVector<TensorValue, 4> inputs( + {&range_dataset_tensor, &num_shards, &index}); + std::unique_ptr<OpKernelContext> shard_dataset_context; + TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs, + &shard_dataset_context)); + + DatasetBase* shard_dataset; + TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(), + shard_dataset_context.get(), &shard_dataset)); + core::ScopedUnref scoped_unref_batch_dataset(shard_dataset); + + EXPECT_EQ(shard_dataset->type_string(), kOpName); +} + +TEST_P(ParameterizedShardDatasetOpTest, DatasetOutputDtypes) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::unique_ptr<OpKernel> shard_dataset_kernel; + TF_ASSERT_OK(CreateShardDatasetOpKernel( + test_case.require_non_empty, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &shard_dataset_kernel)); + + DatasetBase* range_dataset; + TF_ASSERT_OK(CreateRangeDataset<int64>( + test_case.range_dataset_param.start, test_case.range_dataset_param.end, + test_case.range_dataset_param.step, "range", &range_dataset)); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK( + StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor)); + + Tensor num_shards = test_case.num_shards; + Tensor index = test_case.index; + gtl::InlinedVector<TensorValue, 4> inputs( + {&range_dataset_tensor, &num_shards, &index}); + std::unique_ptr<OpKernelContext> shard_dataset_context; + TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs, + &shard_dataset_context)); + + DatasetBase* shard_dataset; + TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(), + shard_dataset_context.get(), &shard_dataset)); + core::ScopedUnref scoped_unref_batch_dataset(shard_dataset); + + TF_EXPECT_OK(VerifyTypesMatch(shard_dataset->output_dtypes(), + test_case.expected_output_dtypes)); +} + +TEST_P(ParameterizedShardDatasetOpTest, DatasetOutputShapes) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::unique_ptr<OpKernel> shard_dataset_kernel; + TF_ASSERT_OK(CreateShardDatasetOpKernel( + test_case.require_non_empty, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &shard_dataset_kernel)); + + DatasetBase* range_dataset; + TF_ASSERT_OK(CreateRangeDataset<int64>( + test_case.range_dataset_param.start, test_case.range_dataset_param.end, + test_case.range_dataset_param.step, "range", &range_dataset)); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK( + StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor)); + + Tensor num_shards = test_case.num_shards; + Tensor index = test_case.index; + gtl::InlinedVector<TensorValue, 4> inputs( + {&range_dataset_tensor, &num_shards, &index}); + std::unique_ptr<OpKernelContext> shard_dataset_context; + TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs, + &shard_dataset_context)); + + DatasetBase* shard_dataset; + 
TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(), + shard_dataset_context.get(), &shard_dataset)); + core::ScopedUnref scoped_unref_batch_dataset(shard_dataset); + + TF_EXPECT_OK(VerifyShapesCompatible(shard_dataset->output_shapes(), + test_case.expected_output_shapes)); +} + +TEST_P(ParameterizedShardDatasetOpTest, Cardinality) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::unique_ptr<OpKernel> shard_dataset_kernel; + TF_ASSERT_OK(CreateShardDatasetOpKernel( + test_case.require_non_empty, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &shard_dataset_kernel)); + + DatasetBase* range_dataset; + TF_ASSERT_OK(CreateRangeDataset<int64>( + test_case.range_dataset_param.start, test_case.range_dataset_param.end, + test_case.range_dataset_param.step, "range", &range_dataset)); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK( + StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor)); + + Tensor num_shards = test_case.num_shards; + Tensor index = test_case.index; + gtl::InlinedVector<TensorValue, 4> inputs( + {&range_dataset_tensor, &num_shards, &index}); + std::unique_ptr<OpKernelContext> shard_dataset_context; + TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs, + &shard_dataset_context)); + + DatasetBase* shard_dataset; + TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(), + shard_dataset_context.get(), &shard_dataset)); + core::ScopedUnref scoped_unref_batch_dataset(shard_dataset); + + EXPECT_EQ(shard_dataset->Cardinality(), test_case.expected_cardinality); +} + +TEST_P(ParameterizedShardDatasetOpTest, DatasetSave) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::unique_ptr<OpKernel> shard_dataset_kernel; + TF_ASSERT_OK(CreateShardDatasetOpKernel( + test_case.require_non_empty, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &shard_dataset_kernel)); + + DatasetBase* range_dataset; + TF_ASSERT_OK(CreateRangeDataset<int64>( + test_case.range_dataset_param.start, test_case.range_dataset_param.end, + test_case.range_dataset_param.step, "range", &range_dataset)); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK( + StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor)); + + Tensor num_shards = test_case.num_shards; + Tensor index = test_case.index; + gtl::InlinedVector<TensorValue, 4> inputs( + {&range_dataset_tensor, &num_shards, &index}); + std::unique_ptr<OpKernelContext> shard_dataset_context; + TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs, + &shard_dataset_context)); + + DatasetBase* shard_dataset; + TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(), + shard_dataset_context.get(), &shard_dataset)); + core::ScopedUnref scoped_unref_batch_dataset(shard_dataset); + + std::unique_ptr<SerializationContext> serialization_context; + TF_ASSERT_OK(CreateSerializationContext(&serialization_context)); + VariantTensorData data; + VariantTensorDataWriter writer(&data); + TF_ASSERT_OK(shard_dataset->Save(serialization_context.get(), &writer)); + TF_ASSERT_OK(writer.Flush()); +} + +TEST_P(ParameterizedShardDatasetOpTest, IteratorOutputDtypes) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + 
TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::unique_ptr<OpKernel> shard_dataset_kernel; + TF_ASSERT_OK(CreateShardDatasetOpKernel( + test_case.require_non_empty, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &shard_dataset_kernel)); + + DatasetBase* range_dataset; + TF_ASSERT_OK(CreateRangeDataset<int64>( + test_case.range_dataset_param.start, test_case.range_dataset_param.end, + test_case.range_dataset_param.step, "range", &range_dataset)); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK( + StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor)); + + Tensor num_shards = test_case.num_shards; + Tensor index = test_case.index; + gtl::InlinedVector<TensorValue, 4> inputs( + {&range_dataset_tensor, &num_shards, &index}); + std::unique_ptr<OpKernelContext> shard_dataset_context; + TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs, + &shard_dataset_context)); + + DatasetBase* shard_dataset; + TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(), + shard_dataset_context.get(), &shard_dataset)); + core::ScopedUnref scoped_unref_batch_dataset(shard_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK( + CreateIteratorContext(shard_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK( + shard_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator)); + + TF_EXPECT_OK(VerifyTypesMatch(iterator->output_dtypes(), + test_case.expected_output_dtypes)); +} + +TEST_P(ParameterizedShardDatasetOpTest, IteratorOutputShapes) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::unique_ptr<OpKernel> shard_dataset_kernel; + TF_ASSERT_OK(CreateShardDatasetOpKernel( + test_case.require_non_empty, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &shard_dataset_kernel)); + + DatasetBase* range_dataset; + TF_ASSERT_OK(CreateRangeDataset<int64>( + test_case.range_dataset_param.start, test_case.range_dataset_param.end, + test_case.range_dataset_param.step, "range", &range_dataset)); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK( + StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor)); + + Tensor num_shards = test_case.num_shards; + Tensor index = test_case.index; + gtl::InlinedVector<TensorValue, 4> inputs( + {&range_dataset_tensor, &num_shards, &index}); + std::unique_ptr<OpKernelContext> shard_dataset_context; + TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs, + &shard_dataset_context)); + + DatasetBase* shard_dataset; + TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(), + shard_dataset_context.get(), &shard_dataset)); + core::ScopedUnref scoped_unref_batch_dataset(shard_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK( + CreateIteratorContext(shard_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK( + shard_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator)); + + TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(), + test_case.expected_output_shapes)); +} + +TEST_F(ShardDatasetOpTest, IteratorOutputPrefix) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = TestCase1(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::unique_ptr<OpKernel> 
shard_dataset_kernel; + TF_ASSERT_OK(CreateShardDatasetOpKernel( + test_case.require_non_empty, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &shard_dataset_kernel)); + + DatasetBase* range_dataset; + TF_ASSERT_OK(CreateRangeDataset<int64>( + test_case.range_dataset_param.start, test_case.range_dataset_param.end, + test_case.range_dataset_param.step, "range", &range_dataset)); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK( + StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor)); + + Tensor num_shards = test_case.num_shards; + Tensor index = test_case.index; + gtl::InlinedVector<TensorValue, 4> inputs( + {&range_dataset_tensor, &num_shards, &index}); + std::unique_ptr<OpKernelContext> shard_dataset_context; + TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs, + &shard_dataset_context)); + + DatasetBase* shard_dataset; + TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(), + shard_dataset_context.get(), &shard_dataset)); + core::ScopedUnref scoped_unref_batch_dataset(shard_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK( + CreateIteratorContext(shard_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK( + shard_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator)); + + EXPECT_EQ(iterator->prefix(), "Iterator::Shard"); +} + +TEST_P(ParameterizedShardDatasetOpTest, Roundtrip) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::unique_ptr<OpKernel> shard_dataset_kernel; + TF_ASSERT_OK(CreateShardDatasetOpKernel( + test_case.require_non_empty, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &shard_dataset_kernel)); + + DatasetBase* range_dataset; + TF_ASSERT_OK(CreateRangeDataset<int64>( + test_case.range_dataset_param.start, test_case.range_dataset_param.end, + test_case.range_dataset_param.step, "range", &range_dataset)); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK( + StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor)); + + Tensor num_shards = test_case.num_shards; + Tensor index = test_case.index; + gtl::InlinedVector<TensorValue, 4> inputs( + {&range_dataset_tensor, &num_shards, &index}); + std::unique_ptr<OpKernelContext> shard_dataset_context; + TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs, + &shard_dataset_context)); + + DatasetBase* shard_dataset; + TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(), + shard_dataset_context.get(), &shard_dataset)); + core::ScopedUnref scoped_unref_batch_dataset(shard_dataset); + + std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK( + CreateIteratorContext(shard_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK( + shard_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator)); + + std::unique_ptr<SerializationContext> serialization_ctx; + TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx)); + + bool end_of_sequence = false; + std::vector<Tensor> out_tensors; + int cur_iteration = 0; + const std::vector<int>& breakpoints = test_case.breakpoints; + for (int breakpoint : breakpoints) { + VariantTensorData data; + VariantTensorDataWriter writer(&data); + TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer)); + TF_EXPECT_OK(writer.Flush()); + VariantTensorDataReader 
reader(&data); + TF_EXPECT_OK(RestoreIterator(iterator_ctx.get(), &reader, "Iterator", + *shard_dataset, &iterator)); + + while (cur_iteration <= breakpoint) { + std::vector<Tensor> next; + TF_EXPECT_OK( + iterator->GetNext(iterator_ctx.get(), &next, &end_of_sequence)); + out_tensors.insert(out_tensors.end(), next.begin(), next.end()); + cur_iteration++; + } + } + + TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs, + /*compare_order*/ true)); +} + +INSTANTIATE_TEST_SUITE_P(ShardDatasetOpTest, ParameterizedShardDatasetOpTest, + ::testing::ValuesIn(std::vector<TestCase>( + {TestCase1(), TestCase2(), TestCase3(), + TestCase4(), TestCase5(), TestCase6(), + TestCase7()}))); + +TEST_F(ShardDatasetOpTest, InvalidArguments) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::vector<TestCase> test_cases = { + IndexGreaterNumShardsCase(), NegativeIndexTestCase(), + NegativeNumShardsTestCase(), ZeroNumShardsTestCase()}; + for (const auto& test_case : test_cases) { + std::unique_ptr<OpKernel> shard_dataset_kernel; + TF_ASSERT_OK(CreateShardDatasetOpKernel( + test_case.require_non_empty, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &shard_dataset_kernel)); + + DatasetBase* range_dataset; + TF_ASSERT_OK(CreateRangeDataset<int64>( + test_case.range_dataset_param.start, test_case.range_dataset_param.end, + test_case.range_dataset_param.step, "range", &range_dataset)); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK( + StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor)); + + Tensor num_shards = test_case.num_shards; + Tensor index = test_case.index; + gtl::InlinedVector<TensorValue, 4> inputs( + {&range_dataset_tensor, &num_shards, &index}); + std::unique_ptr<OpKernelContext> shard_dataset_context; + TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs, + &shard_dataset_context)); + + DatasetBase* shard_dataset; + EXPECT_EQ(CreateDataset(shard_dataset_kernel.get(), + shard_dataset_context.get(), &shard_dataset) + .code(), + tensorflow::error::INVALID_ARGUMENT); + } +} + +TEST_F(ShardDatasetOpTest, NoElemForEachShard) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + TestCase test_case = NoElemForEachShardTestCase(); + + std::unique_ptr<OpKernel> shard_dataset_kernel; + TF_ASSERT_OK(CreateShardDatasetOpKernel( + test_case.require_non_empty, test_case.expected_output_dtypes, + test_case.expected_output_shapes, &shard_dataset_kernel)); + + DatasetBase* range_dataset; + TF_ASSERT_OK(CreateRangeDataset<int64>( + test_case.range_dataset_param.start, test_case.range_dataset_param.end, + test_case.range_dataset_param.step, "range", &range_dataset)); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK( + StoreDatasetInVariantTensor(range_dataset, &range_dataset_tensor)); + + Tensor num_shards = test_case.num_shards; + Tensor index = test_case.index; + gtl::InlinedVector<TensorValue, 4> inputs( + {&range_dataset_tensor, &num_shards, &index}); + std::unique_ptr<OpKernelContext> shard_dataset_context; + TF_ASSERT_OK(CreateShardDatasetContext(shard_dataset_kernel.get(), &inputs, + &shard_dataset_context)); + + DatasetBase* shard_dataset; + TF_ASSERT_OK(CreateDataset(shard_dataset_kernel.get(), + shard_dataset_context.get(), &shard_dataset)); + core::ScopedUnref scoped_unref_batch_dataset(shard_dataset); + + 
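The remainder of this test expects the failure to surface at iteration time rather than at dataset construction: element i of the input goes to shard i % num_shards, so a 10-element range split into 20 shards leaves shards 10 through 19 empty, which require_non_empty = true rejects. A standalone sketch of that assignment rule (not the op's implementation), checked against the expected outputs above:

#include <cstdint>
#include <iostream>
#include <vector>

// Elements of range(0, range_end) that land in shard `index` of `num_shards`.
std::vector<int64_t> ElementsOfShard(int64_t range_end, int64_t num_shards,
                                     int64_t index) {
  std::vector<int64_t> elements;
  for (int64_t i = 0; i < range_end; ++i) {
    if (i % num_shards == index) elements.push_back(i);
  }
  return elements;
}

int main() {
  // Test Case 1: range(0, 10), num_shards = 5, index = 2 -> {2, 7}.
  for (int64_t v : ElementsOfShard(10, 5, 2)) std::cout << v << " ";
  std::cout << "\n";

  // NoElemForEachShardTestCase: range(0, 10), num_shards = 20. Shard 5 still
  // receives element {5}, but no shard past index 9 receives anything, so the
  // op reports InvalidArgument from GetNext when require_non_empty is true.
  std::cout << ElementsOfShard(10, 20, 5).size() << " element(s) in shard 5 of 20\n";
  return 0;
}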
std::unique_ptr<IteratorContext> iterator_ctx; + TF_ASSERT_OK( + CreateIteratorContext(shard_dataset_context.get(), &iterator_ctx)); + std::unique_ptr<IteratorBase> iterator; + TF_ASSERT_OK( + shard_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator)); + + bool end_of_sequence = false; + std::vector<Tensor> out_tensors; + + EXPECT_EQ( + iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence) + .code(), + tensorflow::error::INVALID_ARGUMENT); +} + +} // namespace +} // namespace data +} // namespace tensorflow diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc index 287a7c946c0..add526704f8 100644 --- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc +++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc @@ -364,7 +364,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase { void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - int64 buffer_size; + int64 buffer_size = 0; OP_REQUIRES_OK( ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size)); OP_REQUIRES( @@ -637,7 +637,7 @@ class ShuffleAndRepeatDatasetOp : public ShuffleDatasetOpBase { void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - int64 buffer_size; + int64 buffer_size = 0; OP_REQUIRES_OK( ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size)); OP_REQUIRES( diff --git a/tensorflow/core/kernels/data/window_dataset_op.cc b/tensorflow/core/kernels/data/window_dataset_op.cc index 150385c96ca..bfe2ef35280 100644 --- a/tensorflow/core/kernels/data/window_dataset_op.cc +++ b/tensorflow/core/kernels/data/window_dataset_op.cc @@ -298,7 +298,7 @@ class WindowDatasetOp : public UnaryDatasetOpKernel { input_impl_.reset(); } // Restore buffer. - int64 buffer_size; + int64 buffer_size = 0; TF_RETURN_IF_ERROR( reader->ReadScalar(strings::StrCat("buffer_size"), &buffer_size)); buffer_.resize(buffer_size); diff --git a/tensorflow/core/kernels/dense_update_functor.cc b/tensorflow/core/kernels/dense_update_functor.cc index 3ed3794e01d..4d7eafd4f72 100644 --- a/tensorflow/core/kernels/dense_update_functor.cc +++ b/tensorflow/core/kernels/dense_update_functor.cc @@ -105,7 +105,7 @@ struct DenseUpdate<CPUDevice, string, ASSIGN> { INSTANTIATE_GET_VARIANT_COPY_FN(CPUDevice, TF_CALL_ALL_TYPES, CPU_DENSE_COPY); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define GPU_DENSE_COPY(T) \ case DataTypeToEnum<T>::value: { \ functor::DenseUpdate<GPUDevice, T, ASSIGN> copy_functor_; \ @@ -121,7 +121,7 @@ INSTANTIATE_GET_VARIANT_COPY_FN(GPUDevice, TF_CALL_GPU_AND_ADDITIONAL_TYPES, GPU_DENSE_COPY); #undef TF_CALL_GPU_AND_ADDITIONAL_TYPES #undef GPU_DENSE_COPY -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM #undef CPU_DENSE_COPY #undef INSTANTIATE_GET_VARIANT_COPY_FN diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc index 25c57384ca9..daf8a7380e0 100644 --- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU @@ -72,4 +72,4 @@ TF_CALL_int8(DEFINE_GPU_KERNELS); } // end namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc index f942b1a8a92..c68f1891c39 100644 --- a/tensorflow/core/kernels/dense_update_ops.cc +++ b/tensorflow/core/kernels/dense_update_ops.cc @@ -15,7 +15,7 @@ limitations under the License. #define EIGEN_USE_THREADS -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #endif @@ -102,7 +102,7 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS); TF_CALL_quint16(REGISTER_KERNELS); #undef REGISTER_KERNELS -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Only register 'Assign' on GPU for the subset of types also supported by // 'Variable' (see variable_ops.cc.) #define REGISTER_GPU_KERNELS(type) \ @@ -113,7 +113,7 @@ TF_CALL_quint16(REGISTER_KERNELS); TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS); TF_CALL_int64(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNELS(type) \ @@ -136,7 +136,7 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS); TF_CALL_NUMBER_TYPES(REGISTER_KERNELS); #undef REGISTER_KERNELS -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ Name("AssignAdd").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ @@ -147,7 +147,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_KERNELS); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); TF_CALL_int64(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS -#endif // end GOOGLE_CUDA +#endif // end GOOGLE_CUDA || TENSORFLOW_USE_ROCM #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNELS(type) \ diff --git a/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc b/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc index ea998c5fe79..2abda846fd6 100644 --- a/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc +++ b/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc @@ -160,7 +160,7 @@ struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NHWC> { if (total_count == 0) { return; } - CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); + GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel( D2S_NHWC<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, input.data(), block_size, batch_size, @@ -194,7 +194,7 @@ struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NCHW> { if (total_count == 0) { return; } - CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); + GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); switch (block_size) { case 2: TF_CHECK_OK(CudaLaunchKernel( @@ -225,7 +225,7 @@ struct DepthToSpaceOpFunctor<GPUDevice, T, FORMAT_NCHW> { if (total_count == 0) { return; } - auto config = GetCudaLaunchConfig(total_count, d); + auto config = GetGpuLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel( D2S_NCHW<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, input.data(), block_size, input_width, diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc index ab98cacd1a1..b29e8323332 100644 --- 
a/tensorflow/core/kernels/depthwise_conv_grad_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc @@ -38,7 +38,7 @@ limitations under the License. #include "tensorflow/core/util/work_sharder.h" #if GOOGLE_CUDA -#include "cuda/include/cudnn.h" +#include "third_party/gpus/cudnn/cudnn.h" #include "tensorflow/core/platform/stream_executor.h" #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc index 11c2b31633d..ceaeaac21de 100644 --- a/tensorflow/core/kernels/depthwise_conv_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_op.cc @@ -39,7 +39,7 @@ limitations under the License. #include "tensorflow/core/util/work_sharder.h" #if GOOGLE_CUDA -#include "cuda/include/cudnn.h" +#include "third_party/gpus/cudnn/cudnn.h" #include "tensorflow/core/platform/stream_executor.h" #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.h b/tensorflow/core/kernels/depthwise_conv_op_gpu.h index 5c88c980d4d..33ff78b4c56 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.h +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.h @@ -656,7 +656,7 @@ Status LaunchDepthwiseConv2dGPUSmall(OpKernelContext* ctx, kBlockDepth * (tile_pixels + filter_pixels) * sizeof(S); const int num_outputs = args.out_rows * args.out_cols * block_count; auto device = ctx->eigen_gpu_device(); - CudaLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( + GpuLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( num_outputs, device, kernel, shared_memory_size, block_dim.x * block_dim.y * block_dim.z); TF_CHECK_OK(CudaLaunchKernel(kernel, config.block_count, block_dim, @@ -744,8 +744,8 @@ Status LaunchDepthwiseConv2dGPU(OpKernelContext* ctx, const DepthwiseArgs& args, const int num_outputs = args.batch * args.out_rows * args.out_cols * args.out_depth; auto device = ctx->eigen_gpu_device(); - CudaLaunchConfig config = - GetCudaLaunchConfig(num_outputs, device, kernel, 0, 0); + GpuLaunchConfig config = + GetGpuLaunchConfig(num_outputs, device, kernel, 0, 0); // The compile-time constant version runs faster with a single block. 
const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 || kKnownDepthMultiplier < 0 @@ -967,8 +967,8 @@ Status LaunchDepthwiseConv2dBackpropInputGPU(OpKernelContext* ctx, const int num_in_backprop = args.batch * args.in_rows * args.in_cols * args.in_depth; auto device = ctx->eigen_gpu_device(); - CudaLaunchConfig config = - GetCudaLaunchConfig(num_in_backprop, device, kernel, 0, 0); + GpuLaunchConfig config = + GetGpuLaunchConfig(num_in_backprop, device, kernel, 0, 0); TF_CHECK_OK(CudaLaunchKernel( kernel, config.block_count, config.thread_per_block, 0, device.stream(), args, out_backprop, filter, in_backprop, num_in_backprop)); @@ -1611,7 +1611,7 @@ Status TryLaunchDepthwiseConv2dBackpropFilterGPUSmall( " is not supported"); } const int num_out_backprop = args.out_rows * args.out_cols * block_count; - CudaLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( + GpuLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( num_out_backprop, device, kernel, shared_memory_size, block_dim.x * block_dim.y * block_dim.z); TF_CHECK_OK(CudaLaunchKernel(kernel, config.block_count, block_dim, @@ -1715,8 +1715,8 @@ Status LaunchDepthwiseConv2dBackpropFilterGPU( const int num_out_backprop = args.batch * args.out_rows * args.out_cols * args.out_depth; auto device = ctx->eigen_gpu_device(); - CudaLaunchConfig config = - GetCudaLaunchConfig(num_out_backprop, device, kernel, 0, 0); + GpuLaunchConfig config = + GetGpuLaunchConfig(num_out_backprop, device, kernel, 0, 0); TF_CHECK_OK(CudaLaunchKernel( kernel, config.block_count, config.thread_per_block, 0, device.stream(), args, out_backprop, input, filter_backprop, num_out_backprop)); diff --git a/tensorflow/core/kernels/determinant_op_gpu.cu.cc b/tensorflow/core/kernels/determinant_op_gpu.cu.cc index 65c4981ebdc..387ea3b6607 100644 --- a/tensorflow/core/kernels/determinant_op_gpu.cu.cc +++ b/tensorflow/core/kernels/determinant_op_gpu.cu.cc @@ -128,7 +128,7 @@ struct DeterminantFromPivotedLUFunctor<GPUDevice, Scalar> { int* info) { const int64 num_matrices = output.size(); const int64 n = lu_factor.dimension(2); - CudaLaunchConfig config = GetCudaLaunchConfig(num_matrices, device); + GpuLaunchConfig config = GetCudaLaunchConfig(num_matrices, device); TF_CHECK_OK(CudaLaunchKernel( DeterminantFromPivotedLUKernel<Scalar, /*compute_log_abs_det=*/false>, @@ -151,7 +151,7 @@ struct LogDeterminantFromPivotedLUFunctor<GPUDevice, Scalar> { typename TTypes<Scalar, 1>::Tensor log_abs_det) { const int64 num_matrices = sign.size(); const int64 n = lu_factor.dimension(2); - CudaLaunchConfig config = GetCudaLaunchConfig(num_matrices, device); + GpuLaunchConfig config = GetCudaLaunchConfig(num_matrices, device); TF_CHECK_OK(CudaLaunchKernel( DeterminantFromPivotedLUKernel<Scalar, /*compute_log_abs_det=*/true>, config.block_count, config.thread_per_block, 0, device.stream(), diff --git a/tensorflow/core/kernels/diag_op_gpu.cu.cc b/tensorflow/core/kernels/diag_op_gpu.cu.cc index e188e85b6e3..7ad967fd92f 100644 --- a/tensorflow/core/kernels/diag_op_gpu.cu.cc +++ b/tensorflow/core/kernels/diag_op_gpu.cu.cc @@ -50,7 +50,7 @@ struct DiagFunctor<GPUDevice, T> { return Status::OK(); } - // CudaLaunchConfig uses an int for virtual_thread_count, + // GpuLaunchConfig uses an int for virtual_thread_count, // so this may overflow for `size*size` in extreme cases, // here is checking the multiplication overflow for integer. if (size && (int(size * size) / size) != size) { @@ -60,8 +60,8 @@ struct DiagFunctor<GPUDevice, T> { // Launch the GPU kernel. 
const GPUDevice& device = context->eigen_device<GPUDevice>(); - CudaLaunchConfig diag_config = - GetCudaLaunchConfig(virtual_thread_count, device); + GpuLaunchConfig diag_config = + GetGpuLaunchConfig(virtual_thread_count, device); TF_CHECK_OK( CudaLaunchKernel(DiagCudaKernel<T>, diag_config.block_count, diag_config.thread_per_block, 0, device.stream(), @@ -102,7 +102,7 @@ struct DiagPartFunctor<GPUDevice, T> { const GPUDevice& device = context->eigen_device<GPUDevice>(); // Extract the diagonal elements. - CudaLaunchConfig diag_config = GetCudaLaunchConfig(size, device); + GpuLaunchConfig diag_config = GetCudaLaunchConfig(size, device); TF_CHECK_OK( CudaLaunchKernel(DiagPartCudaKernel<T>, diag_config.block_count, diag_config.thread_per_block, 0, device.stream(), diff --git a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc index c0d477ccd49..588f5677f40 100644 --- a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc @@ -193,7 +193,7 @@ struct Dilation<GPUDevice, T> { const int output_cols = output.dimension(2); const int total_count = batch * output_rows * output_cols * depth; - CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); + GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel( DilationKernel<T>, config.block_count, config.thread_per_block, 0, @@ -224,18 +224,18 @@ struct DilationBackpropInput<GPUDevice, T> { const int output_cols = out_backprop.dimension(2); int total_count; - CudaLaunchConfig config; + GpuLaunchConfig config; // Initialize in_backprop with all zeros. total_count = batch * input_rows * input_cols * depth; - config = GetCudaLaunchConfig(total_count, d); + config = GetGpuLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel(SetZero<T>, config.block_count, config.thread_per_block, 0, d.stream(), total_count, in_backprop.data())); // Accumulate. total_count = batch * output_rows * output_cols * depth; - config = GetCudaLaunchConfig(total_count, d); + config = GetGpuLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel( DilationBackpropInputKernel<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, @@ -266,18 +266,18 @@ struct DilationBackpropFilter<GPUDevice, T> { const int output_cols = out_backprop.dimension(2); int total_count; - CudaLaunchConfig config; + GpuLaunchConfig config; // Initialize filter_backprop with all zeros. total_count = filter_rows * filter_cols * depth; - config = GetCudaLaunchConfig(total_count, d); + config = GetGpuLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel(SetZero<T>, config.block_count, config.thread_per_block, 0, d.stream(), total_count, filter_backprop.data())); // Accumulate. 
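The dilation backprop functors above follow a two-phase pattern: zero the gradient buffer with a SetZero kernel, then accumulate contributions into it. A CPU-side sketch of the same pattern, with hypothetical names and shapes, purely for illustration:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Phase 1 mirrors the SetZero launch above; phase 2 mirrors the accumulate
// kernels. Indices and contributions are illustrative placeholders.
void AccumulateBackprop(const std::vector<float>& contributions,
                        const std::vector<int>& target_index,
                        std::vector<float>* grad) {
  std::fill(grad->begin(), grad->end(), 0.0f);  // initialize with all zeros
  for (std::size_t i = 0; i < contributions.size(); ++i) {
    (*grad)[target_index[i]] += contributions[i];  // accumulate
  }
}

int main() {
  std::vector<float> grad(3);
  AccumulateBackprop({1.0f, 2.0f, 3.0f}, {0, 2, 2}, &grad);
  for (float g : grad) std::printf("%.1f ", g);  // prints 1.0 0.0 5.0
  std::printf("\n");
  return 0;
}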
total_count = batch * output_rows * output_cols * depth; - config = GetCudaLaunchConfig(total_count, d); + config = GetGpuLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel( DilationBackpropFilterKernel<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc index 77ea2234bc9..24cd1b62ce0 100644 --- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc +++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc @@ -78,7 +78,7 @@ __global__ void MoveValuesKernel(const int32* keys, const int32* values, template <typename T> void RangeInit(const GPUDevice& d, const T start, const T delta, const int32 size, typename TTypes<T>::Flat out) { - CudaLaunchConfig config = GetCudaLaunchConfig(size, d); + GpuLaunchConfig config = GetCudaLaunchConfig(size, d); TF_CHECK_OK(CudaLaunchKernel(RangeInitKernel<T>, config.block_count, config.thread_per_block, 0, d.stream(), start, delta, size, out.data())); @@ -93,7 +93,7 @@ void MoveValues(const GPUDevice& d, int32* keys, int32* values, int32* num_runs, // This is valid for correct inputs, because then out_size >= *num_runs. // For wrong inputs, we may have out_size < *num_runs. In this case we will // only handle the first out_size values. - CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d); + GpuLaunchConfig config = GetCudaLaunchConfig(out_size, d); TF_CHECK_OK(CudaLaunchKernel(MoveValuesKernel, config.block_count, config.thread_per_block, 0, d.stream(), keys, values, num_runs, out_size, out)); @@ -103,7 +103,7 @@ template <typename T> void CallGatherKernel(const GPUDevice& d, const T* params, const int32* indices, T* out, int64 gather_dim_size, int64 indices_size, int64 slice_size, int64 out_size) { - CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d); + GpuLaunchConfig config = GetCudaLaunchConfig(out_size, d); TF_CHECK_OK(CudaLaunchKernel( GatherOpKernel<T, int32, true>, config.block_count, config.thread_per_block, 0, d.stream(), params, indices, out, diff --git a/tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc index 8cfceee650a..111b6a0a90c 100644 --- a/tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc +++ b/tensorflow/core/kernels/dynamic_stitch_op_gpu.cu.cc @@ -55,7 +55,7 @@ void DynamicStitchGPUImpl(const Eigen::GpuDevice& gpu_device, const GpuDeviceArrayStruct<const T*>& input_ptrs, T* output) { const int32 output_size = first_dim_size * slice_size; - auto config = GetCudaLaunchConfig(output_size, gpu_device); + auto config = GetGpuLaunchConfig(output_size, gpu_device); TF_CHECK_OK(CudaLaunchKernel(DynamicStitchKernel<T>, config.block_count, config.thread_per_block, 0, gpu_device.stream(), diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h b/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h index b6ab0768763..324e7ac58bd 100644 --- a/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h +++ b/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h @@ -23,8 +23,8 @@ namespace internal { // TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType> // provides `value` that is true if TensorEvaluatorType has `PacketType -// partialPacket(IndexType, unpacket_traits<PacketType>::mask_t) const` and if -// the PacketType supports masked load. 
+// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t) +// const` and if the PacketType supports masked load. // // Partial packets are used to: // @@ -42,12 +42,13 @@ class TensorEvaluatorHasPartialPacket { static auto functionExistsSfinae( typename std::enable_if< unpacket_traits<PacketT>::masked_load_available && - std::is_same< - PacketT, - decltype(std::declval<const TensorEvaluatorT>().partialPacket( - std::declval<IndexT>(), - std::declval<typename unpacket_traits<PacketT>::mask_t>()))>:: - value>::type*) -> std::true_type; + std::is_same<PacketT, + decltype(std::declval<const TensorEvaluatorT>() + .template partialPacket<PacketT>( + std::declval<IndexT>(), + std::declval<typename unpacket_traits< + PacketT>::mask_t>()))>::value>:: + type*) -> std::true_type; template <typename TensorEvaluatorT, typename PacketT, typename IndexT> static auto functionExistsSfinae(...) -> std::false_type; @@ -435,8 +436,8 @@ class TensorContractionInputMapper< const Index depth = patchId - patchOffsets[0] * patchDepth(); const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex; - return m_impl.partialPacket(inputIndex - span[0], - mask<Packet>(span[0], span[1] + 1)); + return m_impl.template partialPacket<Packet>( + inputIndex - span[0], mask<Packet>(span[0], span[1] + 1)); } else { // Using slow path for this partial packet. // We need to load elements starting from index span[0] all the way upto @@ -920,7 +921,7 @@ class TensorContractionSubMapper< partialPacketNoPadding(const Index depth, const Index baseIndex, Index num_coeffs) const { const Index inputIndex = depth + baseIndex; - return m_base_mapper.m_impl.template partialPacket( + return m_base_mapper.m_impl.template partialPacket<PacketT>( inputIndex, mask<PacketT>(0, num_coeffs)); } EIGEN_DEVICE_FUNC diff --git a/tensorflow/core/kernels/eye_functor_gpu.cu.cc b/tensorflow/core/kernels/eye_functor_gpu.cu.cc index 2268d0d2027..358584df51f 100644 --- a/tensorflow/core/kernels/eye_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/eye_functor_gpu.cu.cc @@ -51,7 +51,7 @@ struct EyeFunctor<GPUDevice, Scalar> { const int batch_size = matrix_batch.dimension(0); const int m = matrix_batch.dimension(1); const int n = matrix_batch.dimension(2); - CudaLaunchConfig config = GetCudaLaunchConfig(batch_size * m * n, device); + GpuLaunchConfig config = GetCudaLaunchConfig(batch_size * m * n, device); TF_CHECK_OK(CudaLaunchKernel(EyeKernel<Scalar>, config.block_count, config.thread_per_block, 0, device.stream(), config.virtual_thread_count, batch_size, m, n, diff --git a/tensorflow/core/kernels/fingerprint_op.cc b/tensorflow/core/kernels/fingerprint_op.cc new file mode 100644 index 00000000000..20529326b3d --- /dev/null +++ b/tensorflow/core/kernels/fingerprint_op.cc @@ -0,0 +1,136 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include <cstddef> +#include <string> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/byte_order.h" +#include "tensorflow/core/platform/fingerprint.h" + +namespace tensorflow { +namespace { +template <typename T> +inline void CopyToBuffer(const T& value, uint8* output) { + // Memcpy to string is endian-dependent. We choose little-endian as + // standard. On big-endian machines, bytes should be reversed. +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + static_assert(port::kLittleEndian, ""); + std::memcpy(output, &value, sizeof(value)); +#else + static_assert(!port::kLittleEndian, ""); + std::reverse_copy(reinterpret_cast<const uint8*>(&value), + reinterpret_cast<const uint8*>(&value + 1), output); +#endif +} + +void FarmhashFingerprint64(TTypes<uint8, 2>::ConstTensor input, + TTypes<uint8, 2>::Matrix output) { + DCHECK_EQ(output.dimension(0), input.dimension(0)); + DCHECK_EQ(output.dimension(1), sizeof(uint64)); + for (int64 i = 0; i < output.dimension(0); ++i) { + const uint64 fingerprint = + Fingerprint64({reinterpret_cast<const char*>(&input(i, 0)), + static_cast<std::size_t>(input.dimension(1))}); + CopyToBuffer(fingerprint, &output(i, 0)); + } +} + +void FarmhashFingerprint64(TTypes<string>::ConstFlat input, + TTypes<uint8, 2>::Matrix output) { + DCHECK_EQ(output.dimension(0), input.dimension(0)); + DCHECK_EQ(output.dimension(1), sizeof(uint64)); + for (int64 i = 0; i < input.dimension(0); ++i) { + const uint64 fingerprint = + Fingerprint64({input(i).data(), input(i).size()}); + CopyToBuffer(fingerprint, &output(i, 0)); + } +} + +class FingerprintOp : public OpKernel { + public: + explicit FingerprintOp(OpKernelConstruction* context) : OpKernel(context) { + DataType dtype; + OP_REQUIRES_OK(context, context->GetAttr("T", &dtype)); + OP_REQUIRES(context, DataTypeCanUseMemcpy(dtype) || dtype == DT_STRING, + errors::InvalidArgument("Data type not supported: ", + DataTypeString(dtype))); + } + + void Compute(tensorflow::OpKernelContext* context) override { + const Tensor& method_tensor = context->input(1); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(method_tensor.shape()), + errors::InvalidArgument("`method` should be a scalar string: ", + method_tensor.shape())); + // For now, farmhash64 is the only function supported. 
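CopyToBuffer above normalizes the fingerprint bytes to little-endian so serialized fingerprints match across hosts. A standalone sketch of that normalization using shifts instead of a compile-time byte-order branch (WriteLittleEndian64 is a hypothetical name, not the op's helper):

#include <cstdint>
#include <cstdio>

// Same idea as CopyToBuffer above: always serialize the 64-bit fingerprint
// little-endian so the bytes compare equal across hosts. This version shifts
// bytes out explicitly instead of branching on the host byte order.
void WriteLittleEndian64(uint64_t value, uint8_t out[8]) {
  for (int i = 0; i < 8; ++i) {
    out[i] = static_cast<uint8_t>(value >> (8 * i));
  }
}

int main() {
  uint8_t buf[8];
  WriteLittleEndian64(0x0102030405060708ULL, buf);
  for (int i = 0; i < 8; ++i) std::printf("%02x ", buf[i]);
  std::printf("\n");  // 08 07 06 05 04 03 02 01 on any host
  return 0;
}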
+ const string& method = method_tensor.scalar<string>()(); + OP_REQUIRES( + context, method == "farmhash64", + errors::InvalidArgument("Unsupported fingerprint method: ", method)); + + const Tensor& input = context->input(0); + OP_REQUIRES( + context, TensorShapeUtils::IsVectorOrHigher(input.shape()), + errors::InvalidArgument("`data` should have at least one dimension: ", + input.shape())); + + const int64 dim0 = input.shape().dim_size(0); + const int64 dim1 = input.shape().num_elements() / dim0; + + Tensor* output; + OP_REQUIRES_OK(context, + context->allocate_output( + 0, TensorShape{dim0, kFingerprintSize}, &output)); + + if (input.dtype() == DT_STRING) { + if (dim1 > 1) { + Tensor temp; + OP_REQUIRES_OK(context, context->allocate_temp( + DT_UINT8, + TensorShape{input.shape().num_elements(), + kFingerprintSize}, + &temp)); + // `temp` is a matrix of shape {input.num_elements, fingerprint_size}, + // and each row contains the fingerprint value of corresponding string. + // To compute fingerprints of multiple strings, this op fingerprints the + // buffer containing the string fingerprints. + FarmhashFingerprint64(input.flat<string>(), temp.tensor<uint8, 2>()); + FarmhashFingerprint64(static_cast<const Tensor&>(temp).shaped<uint8, 2>( + {dim0, dim1 * kFingerprintSize}), + output->matrix<uint8>()); + } else { + // In case dim1 == 1, each string computes into its own fingerprint + // value. There is no need to fingerprint twice. + FarmhashFingerprint64(input.flat<string>(), output->matrix<uint8>()); + } + } else { + auto data = input.bit_casted_shaped<uint8, 2>( + {dim0, dim1 * DataTypeSize(input.dtype())}); + FarmhashFingerprint64(data, output->matrix<uint8>()); + } + } + + private: + static constexpr int kFingerprintSize = sizeof(uint64); +}; + +REGISTER_KERNEL_BUILDER(Name("Fingerprint").Device(tensorflow::DEVICE_CPU), + FingerprintOp); +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/fingerprint_op_test.cc b/tensorflow/core/kernels/fingerprint_op_test.cc new file mode 100644 index 00000000000..febfafb4db3 --- /dev/null +++ b/tensorflow/core/kernels/fingerprint_op_test.cc @@ -0,0 +1,242 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include <memory> +#include <numeric> +#include <vector> + +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/shape_inference_testutil.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { +Status MakeNodeDef(DataType dtype, NodeDef* node_def) { + return NodeDefBuilder("fingerprint", "Fingerprint") + .Input(FakeInput(dtype)) + .Input(FakeInput(DT_STRING)) + .Finalize(node_def); +} + +class FingerprintOpTest : public OpsTestBase { + protected: + Status MakeFingerprintOp(Tensor* tensor) { + return MakeFingerprintOp(tensor, "farmhash64"); + } + + Status MakeFingerprintOp(Tensor* data, const string& method) { + TF_RETURN_IF_ERROR(MakeNodeDef(data->dtype(), node_def())); + TF_RETURN_IF_ERROR(InitOp()); + + inputs_.clear(); + inputs_.push_back(data); + + method_ = Tensor(DT_STRING, TensorShape{}); + method_.scalar<string>()() = method; + inputs_.push_back(&method_); + return Status::OK(); + } + + Tensor batch_dims_; + Tensor method_; +}; + +// This test detects changes in fingerprint method. +TEST_F(FingerprintOpTest, GoldenValue) { + Tensor tensor(DT_UINT8, {1, 3, 4, 5, 6, 7}); + auto buffer = tensor.flat<uint8>(); + std::iota(buffer.data(), buffer.data() + buffer.size(), + static_cast<uint8>(47)); + + TF_ASSERT_OK(MakeFingerprintOp(&tensor)); + TF_ASSERT_OK(RunOpKernel()); + EXPECT_EQ(GetOutput(0)->shape(), (TensorShape{1, 8})); + EXPECT_EQ(GetOutput(0)->tensor_data(), "\x2d\x90\xdf\x03\x79\x36\x3c\x43"); +} + +// String types have a different compute path. This test detects changes in this +// special-case handling. +TEST_F(FingerprintOpTest, StringGoldenValue) { + Tensor data(DT_STRING, {1, 2, 2}); + auto buffer = data.flat<string>(); + buffer(0).resize(10); + buffer(1).resize(7); + buffer(2).resize(0); + buffer(3).resize(19); + std::iota(buffer(0).begin(), buffer(0).end(), 0); + std::iota(buffer(1).begin(), buffer(1).end(), 7); + std::iota(buffer(2).begin(), buffer(2).end(), 71); + std::iota(buffer(3).begin(), buffer(3).end(), 41); + + TF_ASSERT_OK(MakeFingerprintOp(&data)); + TF_ASSERT_OK(RunOpKernel()); + ASSERT_EQ(GetOutput(0)->shape(), (TensorShape{1, 8})); + EXPECT_EQ(GetOutput(0)->tensor_data(), "\x92\x43\x28\x52\xa3\x7c\x48\x18"); + + // When each batch item has exactly one string, Fingerprint op avoids + // double-fingerprint. Adding a test to detect any change in this logic. 
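The test comment above refers to the two-level scheme in FingerprintOp: when a batch item holds several strings, each string is fingerprinted, the per-string fingerprints are concatenated, and the row fingerprint is taken over that buffer. A sketch of that structure with a toy FNV-1a hash standing in for Fingerprint64 (so the values here are not the op's golden values):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Toy 64-bit hash (FNV-1a); the op uses Fingerprint64, so values differ, but
// the two-level structure is the same.
uint64_t ToyHash64(const std::string& s) {
  uint64_t h = 1469598103934665603ULL;
  for (unsigned char c : s) {
    h ^= c;
    h *= 1099511628211ULL;
  }
  return h;
}

// dim1 > 1 path: fingerprint each string, concatenate the fingerprints, then
// fingerprint the concatenated buffer to get one value per batch item.
uint64_t RowFingerprint(const std::vector<std::string>& row) {
  std::string buffer;
  for (const std::string& s : row) {
    const uint64_t fp = ToyHash64(s);
    buffer.append(reinterpret_cast<const char*>(&fp), sizeof(fp));
  }
  return ToyHash64(buffer);
}

int main() {
  std::printf("%llu\n",
              static_cast<unsigned long long>(RowFingerprint({"ab", "cd"})));
  return 0;
}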
+ ASSERT_TRUE(data.CopyFrom(data, TensorShape{4})); + TF_ASSERT_OK(MakeFingerprintOp(&data)); + TF_ASSERT_OK(RunOpKernel()); + ASSERT_EQ(GetOutput(0)->shape(), (TensorShape{4, 8})); + EXPECT_EQ(GetOutput(0)->tensor_data(), + "\xea\xff\xd6\xb2\xb2\x4d\x70\x9b" + "\x6e\x9d\xed\x21\xc6\x4a\x61\x52" + "\x4f\x40\x90\x2f\x3b\x6a\xe1\x9a" + "\x0d\x9b\x7f\x63\x23\x14\x1c\xb8"); +} + +TEST_F(FingerprintOpTest, Collision) { + const TensorShape shape = {1, 2, 4, 6}; + for (DataType dtype : kRealNumberTypes) { + const int64 size = shape.num_elements() * DataTypeSize(dtype); + + Tensor tensor(dtype, shape); + auto buffer = tensor.bit_casted_shaped<uint8, 1>({size}); + buffer.setRandom(); + + TF_ASSERT_OK(MakeFingerprintOp(&tensor)); + TF_ASSERT_OK(RunOpKernel()); + const Tensor fingerprint0 = *GetOutput(0); + + // Alter a byte value in the buffer. + const int offset = buffer(0) % buffer.size(); + buffer(offset) = ~buffer(offset); + + TF_ASSERT_OK(MakeFingerprintOp(&tensor)); + TF_ASSERT_OK(RunOpKernel()); + const Tensor fingerprint1 = *GetOutput(0); + + EXPECT_NE(fingerprint0.tensor_data(), fingerprint1.tensor_data()); + } +} + +TEST_F(FingerprintOpTest, CollisionString) { + constexpr int64 size = 256; + + Tensor tensor(DT_STRING, {1}); + auto& input = tensor.vec<string>()(0); + input.resize(size); + + TTypes<uint8>::UnalignedFlat buffer(reinterpret_cast<uint8*>(&*input.begin()), + input.size()); + buffer.setRandom(); + + TF_ASSERT_OK(MakeFingerprintOp(&tensor)); + TF_ASSERT_OK(RunOpKernel()); + const Tensor fingerprint0 = *GetOutput(0); + + // Alter a byte value in the buffer. + const int offset = buffer(0) % buffer.size(); + buffer(offset) = ~buffer(offset); + + TF_ASSERT_OK(MakeFingerprintOp(&tensor)); + TF_ASSERT_OK(RunOpKernel()); + const Tensor fingerprint1 = *GetOutput(0); + + EXPECT_NE(fingerprint0.tensor_data(), fingerprint1.tensor_data()); +} + +TEST_F(FingerprintOpTest, CompareBytesAndString) { + Tensor pods_tensor(DT_FLOAT, {4, 64}); + Tensor strings_tensor(DT_STRING, {4}); + + auto pods = pods_tensor.matrix<float>(); + pods.setRandom(); + + auto strings = strings_tensor.vec<string>(); + for (int64 i = 0; i < strings.size(); ++i) { + strings(i).assign(reinterpret_cast<const char*>(&pods(i, 0)), + pods.dimension(1) * sizeof(pods(i, 0))); + } + + TF_ASSERT_OK(MakeFingerprintOp(&pods_tensor)); + TF_ASSERT_OK(RunOpKernel()); + Tensor pods_fingerprints = *GetOutput(0); + + TF_ASSERT_OK(MakeFingerprintOp(&strings_tensor)); + TF_ASSERT_OK(RunOpKernel()); + Tensor strings_fingerprints = *GetOutput(0); + + EXPECT_EQ(pods_fingerprints.tensor_data(), + strings_fingerprints.tensor_data()); +} + +TEST_F(FingerprintOpTest, SupportedMethods) { + Tensor tensor(DT_STRING, TensorShape{1}); + TF_ASSERT_OK(MakeFingerprintOp(&tensor, "unsupported_method")); + + const Status status = RunOpKernel(); + EXPECT_FALSE(status.ok()); + EXPECT_NE(status.error_message().find("unsupported_method"), string::npos); +} + +TEST_F(FingerprintOpTest, SupportedTypes) { + Tensor input(DT_RESOURCE, TensorShape{1}); + EXPECT_FALSE(MakeFingerprintOp(&input).ok()); +} + +TEST(FingerprintOpShapeFnTest, MethodKnownStatically) { + ShapeInferenceTestOp op("Fingerprint"); + + Tensor method(DT_STRING, TensorShape{}); + method.scalar<string>()() = "farmhash64"; + op.input_tensors.assign({nullptr, &method}); + + TF_ASSERT_OK(MakeNodeDef(DT_UINT8, &op.node_def)); + INFER_OK(op, "?;?", "[?,8]"); + INFER_ERROR("must be at least rank 1", op, "[];?"); + INFER_OK(op, "[?];?", "[d0_0,8]"); + INFER_OK(op, "[1,?];?", "[d0_0,8]"); + INFER_OK(op, 
"[?,2,3];?", "[d0_0,8]"); +} + +TEST(FingerprintOpShapeFnTest, MethodUnknownStatically) { + ShapeInferenceTestOp op("Fingerprint"); + + TF_ASSERT_OK(MakeNodeDef(DT_FLOAT, &op.node_def)); + INFER_OK(op, "?;?", "[?,?]"); + INFER_ERROR("must be at least rank 1", op, "[];?"); + INFER_OK(op, "[?];?", "[d0_0,?]"); + INFER_OK(op, "[1,?];?", "[d0_0,?]"); + INFER_OK(op, "[?,2,3];?", "[d0_0,?]"); +} + +TEST(FingerprintOpShapeFnTest, InvalidMethod) { + ShapeInferenceTestOp op("Fingerprint"); + + // When `method` shape is known statically. + INFER_ERROR("must be rank 0", op, "[1];[1]"); + + // When `method` shape is unknown statically. + Tensor method(DT_STRING, TensorShape{1}); + method.vec<string>()(0) = "farmhash64"; + op.input_tensors.assign({nullptr, &method}); + INFER_ERROR("must be rank 0", op, "?;?"); + + method = Tensor(DT_STRING, TensorShape{}); + method.scalar<string>()() = "unsupported_method"; + op.input_tensors.assign({nullptr, &method}); + INFER_ERROR("unsupported_method", op, "?;?"); +} +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc index d3f9e68f694..37f2bd309b5 100644 --- a/tensorflow/core/kernels/function_ops.cc +++ b/tensorflow/core/kernels/function_ops.cc @@ -250,7 +250,6 @@ class SymbolicGradientOp : public AsyncOpKernel { ctx, lib->Instantiate(kGradientOp, AttrSlice(def()), &handle), done); FunctionLibraryRuntime::Options opts; - opts.step_id = ctx->step_id(); opts.rendezvous = ctx->rendezvous(); opts.cancellation_manager = ctx->cancellation_manager(); opts.runner = ctx->runner(); @@ -352,7 +351,6 @@ void RemoteCallOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &arguments), done); FunctionLibraryRuntime::Options opts; - opts.step_id = ctx->step_id(); opts.runner = ctx->runner(); opts.source_device = source_device; if (opts.source_device != target_device) { diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc index 246a6ce04d9..8792387f99d 100644 --- a/tensorflow/core/kernels/functional_ops.cc +++ b/tensorflow/core/kernels/functional_ops.cc @@ -114,7 +114,6 @@ Status SetOutputs(const OpKernel* kernel, OpKernelContext* ctx, void SetRunOptions(OpKernelContext* ctx, FunctionLibraryRuntime::Options* opts, bool always_collect_stats) { - opts->step_id = ctx->step_id(); opts->rendezvous = ctx->rendezvous(); opts->cancellation_manager = ctx->cancellation_manager(); if (always_collect_stats) { diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc index 48b339508b5..40a58defe72 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/fused_batch_norm_op.cc @@ -253,13 +253,22 @@ struct FusedBatchNorm<GPUDevice, T, U> { const int64 channels = GetTensorDim(x, tensor_format, 'C'); const int64 height = GetTensorDim(x, tensor_format, 'H'); const int64 width = GetTensorDim(x, tensor_format, 'W'); + + // If input tensor is in NHWC format, and we are running in inference mode, + // there is no need to convert to NCHW format, performance is the same. + // However in training mode, performance in NCHW format is much better. + TensorFormat compute_format = !is_training && tensor_format == FORMAT_NHWC + ? 
FORMAT_NHWC + : FORMAT_NCHW; + VLOG(2) << "FusedBatchNorm:" << " batch_size: " << batch_size << " channels: " << channels << " height: " << height << " width:" << width << " x shape: " << x.shape().DebugString() << " scale shape: " << scale.shape().DebugString() << " offset shape: " << offset.shape().DebugString() - << " tensor format: " << tensor_format; + << " tensor format: " << ToString(tensor_format) + << " compute format: " << ToString(compute_format); // If input is empty, return NaN mean/variance if (x.shape().num_elements() == 0) { @@ -274,12 +283,12 @@ struct FusedBatchNorm<GPUDevice, T, U> { Tensor y_transformed; se::DeviceMemory<T> y_ptr; - if (tensor_format == FORMAT_NCHW) { + if (tensor_format == compute_format) { y_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*y); - } else if (tensor_format == FORMAT_NHWC) { + } else if (tensor_format == FORMAT_NHWC && compute_format == FORMAT_NCHW) { OP_REQUIRES_OK(context, context->allocate_temp( DataTypeToEnum<T>::value, - ShapeFromFormat(FORMAT_NCHW, batch_size, + ShapeFromFormat(compute_format, batch_size, height, width, channels), &x_transformed)); functor::NHWCToNCHW<GPUDevice, T, 4>()( @@ -290,22 +299,27 @@ struct FusedBatchNorm<GPUDevice, T, U> { OP_REQUIRES_OK(context, context->allocate_temp( DataTypeToEnum<T>::value, - ShapeFromFormat(FORMAT_NCHW, batch_size, + ShapeFromFormat(compute_format, batch_size, height, width, channels), &y_transformed)); y_ptr = StreamExecutorUtil::AsDeviceMemory<T>(y_transformed); } else { - context->SetStatus( - errors::Internal("Unsupported tensor format: ", tensor_format)); + context->SetStatus(errors::Internal( + "Unsupported tensor format: ", ToString(tensor_format), + " and compute format: ", ToString(compute_format))); return; } + const se::dnn::DataLayout data_layout = + compute_format == FORMAT_NHWC ? se::dnn::DataLayout::kBatchYXDepth + : se::dnn::DataLayout::kBatchDepthYX; + se::dnn::BatchDescriptor x_desc; x_desc.set_count(batch_size) .set_feature_map_count(channels) .set_height(height) .set_width(width) - .set_layout(se::dnn::DataLayout::kBatchDepthYX); + .set_layout(data_layout); se::dnn::BatchDescriptor scale_offset_desc; scale_offset_desc.set_count(1) @@ -371,7 +385,8 @@ struct FusedBatchNorm<GPUDevice, T, U> { errors::Internal("cuDNN launch failure : input shape (", x.shape().DebugString(), ")")); } - if (tensor_format == FORMAT_NHWC) { + + if (tensor_format == FORMAT_NHWC && compute_format == FORMAT_NCHW) { functor::NCHWToNHWC<GPUDevice, T, 4>()( context->eigen_device<GPUDevice>(), const_cast<const Tensor&>(y_transformed).tensor<T, 4>(), diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc index 5a6587ef4ea..261cb9d1b31 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc +++ b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. 
#if GOOGLE_CUDA #define EIGEN_USE_GPU -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda.h" #include "tensorflow/core/kernels/fused_batch_norm_op.h" #include "tensorflow/core/util/gpu_kernel_helper.h" @@ -37,7 +37,7 @@ template <class T> void VarianceToInvVariance<T>::operator()(const Eigen::GpuDevice& d, const T* variance, double epsilon, int channels, T* inv_variance) { - CudaLaunchConfig config = GetCudaLaunchConfig(channels, d); + GpuLaunchConfig config = GetCudaLaunchConfig(channels, d); TF_CHECK_OK(CudaLaunchKernel(VarianceToInvVarianceKernel<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, @@ -60,7 +60,7 @@ template <class T> void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d, double epsilon, int sample_size, int channels, T* variance) { - CudaLaunchConfig config = GetCudaLaunchConfig(channels, d); + GpuLaunchConfig config = GetCudaLaunchConfig(channels, d); TF_CHECK_OK(CudaLaunchKernel(InvVarianceToVarianceKernel<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, epsilon, diff --git a/tensorflow/core/kernels/fused_batch_norm_op_test.cc b/tensorflow/core/kernels/fused_batch_norm_op_test.cc index a3f760b746a..1b348a600b6 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op_test.cc +++ b/tensorflow/core/kernels/fused_batch_norm_op_test.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include <vector> + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/framework/node_def_builder.h" @@ -21,10 +23,12 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" namespace tensorflow { class FusedBatchNormOpTest : public OpsTestBase {}; @@ -124,4 +128,82 @@ TEST_F(FusedBatchNormGradOpTest, Simple) { test::FillValues<float>(&expected_offset, {27, 27}); test::ExpectTensorNear<float>(expected_offset, *GetOutput(2), 0.01); } + +//----------------------------------------------------------------------------// +// Performance benchmarks are below. // +//----------------------------------------------------------------------------// + +using fp32 = float; +using fp16 = Eigen::half; + +template <typename T> +static Graph* FusedBatchNormInference(int n, int h, int w, int c, + bool is_training, + TensorFormat data_format) { + Graph* g = new Graph(OpRegistry::Global()); + + DataType dtype = DataTypeToEnum<T>::value; + Tensor x_t(dtype, data_format == FORMAT_NHWC ? 
TensorShape({n, h, w, c}) + : TensorShape({n, c, h, w})); + x_t.flat<T>().setRandom(); + + Tensor other_t(DT_FLOAT, TensorShape({c})); + other_t.flat<float>().setRandom(); + + Tensor empty_t(DT_FLOAT, TensorShape({0})); + + Node* x = test::graph::Constant(g, x_t, "x"); + Node* other = test::graph::Constant(g, other_t, "other"); + Node* empty = test::graph::Constant(g, empty_t, "empty"); + + Node* fused_batch_norm; + TF_CHECK_OK(NodeBuilder(g->NewName("fused_batch_norm"), "FusedBatchNormV2") + .Input(x) + .Input(other) // scale + .Input(other) // offset + .Input(is_training ? empty : other) // mean + .Input(is_training ? empty : other) // variance + .Attr("T", dtype) + .Attr("U", DT_FLOAT) + .Attr("epsilon", 0.001) + .Attr("is_training", is_training) + .Attr("data_format", ToString(data_format)) + .Finalize(g, &fused_batch_norm)); + + return g; +} + +#define BM_NAME(N, H, W, C, T, IT, FORMAT, DEVICE) \ + BM_FusedBatchNorm##_##N##_##H##_##W##_##C##_##IT##_##FORMAT##_##T##_##DEVICE + +#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \ + static void BM_NAME(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)(int iters) { \ + testing::UseRealTime(); \ + testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \ + test::Benchmark(#DEVICE, FusedBatchNormInference<T>( \ + N, H, W, C, IS_TRAINING, FORMAT_##FORMAT)) \ + .Run(iters); \ + } \ + BENCHMARK(BM_NAME(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)); + +BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, cpu); +BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, cpu); + +BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NHWC, cpu); +BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NHWC, cpu); + +#ifdef GOOGLE_CUDA +BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, gpu); +BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, gpu); + +BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NCHW, gpu); +BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NCHW, gpu); + +BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NHWC, gpu); +BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NHWC, gpu); + +BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NCHW, gpu); +BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NCHW, gpu); +#endif // GOOGLE_CUDA + } // namespace tensorflow diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/1b0384bc2d5ac42b7425ac51c374f60c b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1b0384bc2d5ac42b7425ac51c374f60c new file mode 100644 index 00000000000..00930411573 Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1b0384bc2d5ac42b7425ac51c374f60c differ diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/38bd2bd767d0c4ddd531b3893080b952 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/38bd2bd767d0c4ddd531b3893080b952 new file mode 100644 index 00000000000..07e0bbeab40 Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/38bd2bd767d0c4ddd531b3893080b952 differ diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/41438a3c1c77c64a2f0840a2427f8834 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/41438a3c1c77c64a2f0840a2427f8834 new file mode 100644 index 00000000000..f2da9c416b2 --- /dev/null +++ b/tensorflow/core/kernels/fuzzing/corpus/decode_png/41438a3c1c77c64a2f0840a2427f8834 @@ -0,0 +1 @@ +�+����������������������� \ No newline at end of file diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.h b/tensorflow/core/kernels/gather_functor_gpu.cu.h index e9a928e4015..2db44621c91 100644 --- 
a/tensorflow/core/kernels/gather_functor_gpu.cu.h +++ b/tensorflow/core/kernels/gather_functor_gpu.cu.h @@ -90,7 +90,7 @@ struct GatherFunctor<GPUDevice, T, Index> { const int64 indices_size = indices.size(); const int64 slice_size = params.dimension(2); - CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d); + GpuLaunchConfig config = GetCudaLaunchConfig(out_size, d); if (is_axis_zero) { TF_CHECK_OK(CudaLaunchKernel( GatherOpKernel<T, Index, true>, config.block_count, diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc index b5b6f14bcda..0b82b72ccc3 100644 --- a/tensorflow/core/kernels/gather_nd_op.cc +++ b/tensorflow/core/kernels/gather_nd_op.cc @@ -71,6 +71,7 @@ class GatherNdOp : public OpKernel { // // Same for the GPU kernel. TF_CALL_ALL_TYPES(REGISTER_GATHER_ND_CPU); +TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_ND_CPU); #undef REGISTER_GATHER_ND_CPU diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h index cf9817dc306..c3d2f701398 100644 --- a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h +++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h @@ -152,6 +152,7 @@ struct GatherNdSlice<CPUDevice, T, Index, IXDIM> { REGISTER_GATHER_ND_FULL(type, int64) TF_CALL_ALL_TYPES(REGISTER_GATHER_ND_CPU); +TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_ND_CPU); } // namespace functor diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc index 0603ff0a4c7..1274e3f75c9 100644 --- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc @@ -84,7 +84,7 @@ struct GatherNdSlice<GPUDevice, T, Index, IXDIM> { batch_indices[i - 1] = Tparams.dimension(i - 1); batch_strides[i - 1] = batch_strides[i] * Tparams.dimension(i); } - CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d); + GpuLaunchConfig config = GetCudaLaunchConfig(out_size, d); TF_CHECK_OK(CudaLaunchKernel(GatherSliceOpKernel<T, Index, IXDIM>, config.block_count, config.thread_per_block, 0, diff --git a/tensorflow/core/kernels/gather_nd_op_test.cc b/tensorflow/core/kernels/gather_nd_op_test.cc index 9f8658ef0e8..b0b5c958b5a 100644 --- a/tensorflow/core/kernels/gather_nd_op_test.cc +++ b/tensorflow/core/kernels/gather_nd_op_test.cc @@ -57,9 +57,9 @@ namespace { class GatherNdOpTest : public OpsTestBase { protected: - void MakeOp(DataType index_type) { + void MakeOp(DataType param_type, DataType index_type) { TF_ASSERT_OK(NodeDefBuilder("myop", "GatherNd") - .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(param_type)) .Input(FakeInput(index_type)) .Finalize(node_def())); TF_ASSERT_OK(InitOp()); @@ -67,7 +67,7 @@ class GatherNdOpTest : public OpsTestBase { }; TEST_F(GatherNdOpTest, Simple) { - MakeOp(DT_INT32); + MakeOp(DT_FLOAT, DT_INT32); // Feed and run AddInputFromArray<float>(TensorShape({5}), {0, 1, 2, 8, 4}); @@ -80,6 +80,32 @@ TEST_F(GatherNdOpTest, Simple) { test::ExpectTensorEqual<float>(expected, *GetOutput(0)); } +TEST_F(GatherNdOpTest, Quantized_UINT8) { + MakeOp(DT_QUINT8, DT_INT32); + + // Feed and run + AddInputFromArray<quint8>(TensorShape({5}), {0, 1, 2, 8, 4}); + AddInputFromArray<int32>(TensorShape({2, 1}), {3, 4}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
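For the quantized GatherNd tests above, the indexing being exercised is simple: with rank-1 params and indices of shape {N, 1}, output row i is params[indices[i][0]]. A minimal CPU sketch mirroring the test inputs (GatherNdRank1 is a hypothetical helper, not the kernel):

#include <cstdio>
#include <vector>

// With rank-1 params and indices of shape {N, 1}, GatherNd reduces to a plain
// element lookup; the dtype only affects the element type, which is why the
// quantized registrations above can reuse the same kernel.
template <typename T>
std::vector<T> GatherNdRank1(const std::vector<T>& params,
                             const std::vector<std::vector<int>>& indices) {
  std::vector<T> out;
  out.reserve(indices.size());
  for (const std::vector<int>& index : indices) {
    out.push_back(params[index[0]]);
  }
  return out;
}

int main() {
  // Mirrors the tests: params {0, 1, 2, 8, 4}, indices {{3}, {4}} -> {8, 4}.
  const std::vector<int> result =
      GatherNdRank1<int>({0, 1, 2, 8, 4}, {{3}, {4}});
  std::printf("%d %d\n", result[0], result[1]);
  return 0;
}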
+ Tensor expected(allocator(), DT_QUINT8, TensorShape({2})); + test::FillValues<quint8>(&expected, {8, 4}); + test::ExpectTensorEqual<quint8>(expected, *GetOutput(0)); +} + +TEST_F(GatherNdOpTest, Quantized_INT8) { + MakeOp(DT_QINT8, DT_INT32); + + AddInputFromArray<qint8>(TensorShape({5}), {0, 1, 2, 8, 4}); + AddInputFromArray<int32>(TensorShape({2, 1}), {3, 4}); + TF_ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_QINT8, TensorShape({2})); + test::FillValues<qint8>(&expected, {8, 4}); + test::ExpectTensorEqual<qint8>(expected, *GetOutput(0)); +} + constexpr int kLookups = 2000; template <typename Index> diff --git a/tensorflow/core/kernels/gpu_utils.cc b/tensorflow/core/kernels/gpu_utils.cc index 305dac409d9..a6a13345c71 100644 --- a/tensorflow/core/kernels/gpu_utils.cc +++ b/tensorflow/core/kernels/gpu_utils.cc @@ -70,6 +70,8 @@ void LogConvAutotuneResults(se::dnn::ConvolutionKind kind, *instr.mutable_output() = output_desc.ToProto(element_type); *instr.mutable_conv_desc() = conv_desc.ToProto(); log.mutable_instr()->PackFrom(std::move(instr)); + instr.set_conv_scale(1); + instr.set_side_value_scale(0); } *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec); *log.mutable_compute_capability() = GetComputeCapability(stream_exec); @@ -80,9 +82,8 @@ void LogConvAutotuneResults(se::dnn::ConvolutionKind kind, Logger::Singleton()->LogProto(log); } -void LogFusedConvAutotuneResults( - se::dnn::ConvolutionKind kind, se::dnn::DataType element_type, - const se::dnn::BatchDescriptor& input_desc, +void LogFusedConvForwardAutotuneResults( + se::dnn::DataType element_type, const se::dnn::BatchDescriptor& input_desc, const se::dnn::FilterDescriptor& filter_desc, const se::dnn::BatchDescriptor& output_desc, const se::dnn::ConvolutionDescriptor& conv_desc, double conv_scale, @@ -91,7 +92,7 @@ void LogFusedConvAutotuneResults( AutotuningLog log; { ConvolutionProto instr; - instr.set_kind(kind); + instr.set_kind(se::dnn::ConvolutionKind::FORWARD_BIAS_ACTIVATION); *instr.mutable_input() = input_desc.ToProto(element_type); *instr.mutable_filter() = filter_desc.ToProto(element_type); *instr.mutable_output() = output_desc.ToProto(element_type); diff --git a/tensorflow/core/kernels/gpu_utils.h b/tensorflow/core/kernels/gpu_utils.h index d412e61e91e..14cd639446d 100644 --- a/tensorflow/core/kernels/gpu_utils.h +++ b/tensorflow/core/kernels/gpu_utils.h @@ -198,9 +198,8 @@ void LogConvAutotuneResults(se::dnn::ConvolutionKind kind, absl::Span<const AutotuneResult> results); // Logs fused convolution results to customized back-storage. 
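The autotune logging functions above record one result per candidate algorithm; elsewhere those results are compared to choose an algorithm. A rough, standalone sketch of that selection idea with a toy result struct (the real code uses the AutotuneResult proto and more elaborate criteria):

#include <cstdio>
#include <limits>
#include <string>
#include <vector>

// Toy stand-in for one logged autotune entry.
struct ToyAutotuneResult {
  std::string algorithm;
  double elapsed_ms;
  bool valid;
};

// Pick the fastest entry that actually ran successfully; returns nullptr when
// nothing valid was profiled.
const ToyAutotuneResult* PickBest(
    const std::vector<ToyAutotuneResult>& results) {
  const ToyAutotuneResult* best = nullptr;
  double best_ms = std::numeric_limits<double>::infinity();
  for (const ToyAutotuneResult& r : results) {
    if (r.valid && r.elapsed_ms < best_ms) {
      best = &r;
      best_ms = r.elapsed_ms;
    }
  }
  return best;
}

int main() {
  const std::vector<ToyAutotuneResult> results = {
      {"algo_a", 0.42, true}, {"algo_b", 0.55, true}, {"algo_c", 0.30, false}};
  if (const ToyAutotuneResult* best = PickBest(results)) {
    std::printf("best: %s\n", best->algorithm.c_str());
  }
  return 0;
}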
-void LogFusedConvAutotuneResults( - se::dnn::ConvolutionKind kind, se::dnn::DataType element_type, - const se::dnn::BatchDescriptor& input_desc, +void LogFusedConvForwardAutotuneResults( + se::dnn::DataType element_type, const se::dnn::BatchDescriptor& input_desc, const se::dnn::FilterDescriptor& filter_desc, const se::dnn::BatchDescriptor& output_desc, const se::dnn::ConvolutionDescriptor& conv_desc, double conv_scale, diff --git a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc index 20672c5be2b..cdb42645ee2 100644 --- a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc @@ -43,7 +43,7 @@ template <typename T> Status DoParallelConcatUpdate(const Device& d, const Tensor& value, int32 loc, Tensor* output) { const int64 nelem = value.NumElements(); - CudaLaunchConfig cfg = GetCudaLaunchConfig(nelem, d); + GpuLaunchConfig cfg = GetCudaLaunchConfig(nelem, d); auto Toutput = output->flat_outer_dims<T>(); const int64 nrows = Toutput.dimension(0); const int64 ncols = Toutput.dimension(1); @@ -106,7 +106,7 @@ template <typename T> void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i, const Tensor& v, Tensor* y) { const int64 nelem = v.NumElements(); - CudaLaunchConfig cfg = GetCudaLaunchConfig(nelem, d); + GpuLaunchConfig cfg = GetCudaLaunchConfig(nelem, d); auto Ty = y->flat_outer_dims<T>(); const int64 nrows = Ty.dimension(0); const int64 ncols = Ty.dimension(1); @@ -141,7 +141,7 @@ template <bool> void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i, const Tensor& v, Tensor* y) { const int64 nelem = v.NumElements(); - CudaLaunchConfig cfg = GetCudaLaunchConfig(nelem, d); + GpuLaunchConfig cfg = GetCudaLaunchConfig(nelem, d); auto Ty = y->flat_outer_dims<bool>(); const int64 nrows = Ty.dimension(0); const int64 ncols = Ty.dimension(1); diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc index 7137bbf4983..c0ec46aacb4 100644 --- a/tensorflow/core/kernels/logging_ops.cc +++ b/tensorflow/core/kernels/logging_ops.cc @@ -13,16 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/kernels/logging_ops.h" - #include <iostream> #include "absl/strings/str_cat.h" -#include "absl/strings/str_split.h" +#include "tensorflow/core/framework/logging.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/platform/logging.h" namespace tensorflow { @@ -51,22 +48,6 @@ Status AppendStringToFile(const std::string& fname, StringPiece data, } // namespace -namespace logging { - -typedef std::vector<void (*)(const char*)> Listeners; - -Listeners* GetListeners() { - static Listeners* listeners = new Listeners; - return listeners; -} - -bool RegisterListener(void (*listener)(const char*)) { - GetListeners()->push_back(listener); - return true; -} - -} // end namespace logging - class AssertOp : public OpKernel { public: explicit AssertOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -180,12 +161,12 @@ class PrintV2Op : public OpKernel { AppendStringToFile(file_path_, ended_msg, ctx->env())); return; } - auto listeners = logging::GetListeners(); - if (!listeners->empty()) { - for (auto& listener : *listeners) { - listener(ended_msg.c_str()); - } - } else if (output_stream_ == "stdout") { + + if (logging::LogToListeners(ended_msg, "")) { + return; + } + + if (output_stream_ == "stdout") { std::cout << ended_msg << std::flush; } else if (output_stream_ == "stderr") { std::cerr << ended_msg << std::flush; diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc index ba30432e21a..a5757e433d0 100644 --- a/tensorflow/core/kernels/lrn_op.cc +++ b/tensorflow/core/kernels/lrn_op.cc @@ -35,7 +35,7 @@ limitations under the License. #endif #if GOOGLE_CUDA -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/util/stream_executor_util.h" #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/lu_op_gpu.cu.cc b/tensorflow/core/kernels/lu_op_gpu.cu.cc index ab4e528cf20..6d94d7f3f64 100644 --- a/tensorflow/core/kernels/lu_op_gpu.cu.cc +++ b/tensorflow/core/kernels/lu_op_gpu.cu.cc @@ -61,7 +61,7 @@ __device__ void ComputePermutationFromTranspositions( // transpositions. template <typename Scalar> __global__ void ComputePermutationFromTranspositionsKernel( - CudaLaunchConfig config, const int64 num_rows, const int* all_pivots, + GpuLaunchConfig config, const int64 num_rows, const int* all_pivots, Scalar* all_permutation_indices) { // We only parallelize over batches here. Performance is not critical, // since this cheap O(num_rows) kernel always follows an O(num_rows^3) @@ -222,7 +222,7 @@ class LuOpGpu : public AsyncOpKernel { int* pivots_ptr = pivots.flat<int>().data(); Tidx* permutation_indices_ptr = permutation_indices->template flat<Tidx>().data(); - CudaLaunchConfig cfgPivots = GetCudaLaunchConfig(batch_size, device); + GpuLaunchConfig cfgPivots = GetCudaLaunchConfig(batch_size, device); TF_CHECK_OK(CudaLaunchKernel( ComputePermutationFromTranspositionsKernel<Tidx>, cfgPivots.block_count, cfgPivots.thread_per_block, 0, device.stream(), cfgPivots, num_rows, diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc index 941cafe0bcb..a6b8be95b90 100644 --- a/tensorflow/core/kernels/matmul_op.cc +++ b/tensorflow/core/kernels/matmul_op.cc @@ -25,7 +25,7 @@ limitations under the License. 
#include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/util/matmul_autotune.h" #if GOOGLE_CUDA -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda.h" #include "tensorflow/core/kernels/gpu_utils.h" #include "tensorflow/core/platform/stream_executor.h" #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc index ea1b4960772..2195c583130 100644 --- a/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc +++ b/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc @@ -58,7 +58,7 @@ struct MatrixBandPartFunctor<GPUDevice, Scalar> { const int batch_size = input.dimension(0); const int m = input.dimension(1); const int n = input.dimension(2); - CudaLaunchConfig config = GetCudaLaunchConfig(batch_size * m * n, device); + GpuLaunchConfig config = GetCudaLaunchConfig(batch_size * m * n, device); TF_CHECK_OK(CudaLaunchKernel(MatrixBandPartKernel<Scalar>, config.block_count, config.thread_per_block, 0, device.stream(), config.virtual_thread_count, diff --git a/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc index 096da44152e..4ee52f57939 100644 --- a/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc +++ b/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc @@ -71,15 +71,14 @@ struct MatrixSetDiag<GPUDevice, Scalar> { CHECK_EQ(diag.dimension(1), minsize); if (batch_size == 0 || minsize == 0) return; if (input.data() == output.data()) { - CudaLaunchConfig config = - GetCudaLaunchConfig(batch_size * minsize, device); + GpuLaunchConfig config = GetGpuLaunchConfig(batch_size * minsize, device); TF_CHECK_OK(CudaLaunchKernel(MatrixSetDiagKernel<Scalar>, config.block_count, config.thread_per_block, 0, device.stream(), config.virtual_thread_count, m, n, minsize, diag.data(), output.data())); } else { - CudaLaunchConfig config = GetCudaLaunchConfig(batch_size * m * n, device); + GpuLaunchConfig config = GetCudaLaunchConfig(batch_size * m * n, device); TF_CHECK_OK(CudaLaunchKernel(MatrixCopyInputAndSetDiagKernel<Scalar>, config.block_count, config.thread_per_block, 0, device.stream(), diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc index fcca2f718d8..a3592d8ec3c 100644 --- a/tensorflow/core/kernels/maxpooling_op.cc +++ b/tensorflow/core/kernels/maxpooling_op.cc @@ -41,7 +41,7 @@ limitations under the License. 
#include "tensorflow/core/util/use_cudnn.h" #if GOOGLE_CUDA -#include "cuda/include/cudnn.h" +#include "third_party/gpus/cudnn/cudnn.h" #include "tensorflow/core/kernels/maxpooling_op_gpu.h" #include "tensorflow/core/kernels/pooling_ops_common_gpu.h" #include "tensorflow/core/platform/stream_executor.h" diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc index 58f99a1e398..fec6f2ebd85 100644 --- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc +++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc @@ -486,7 +486,7 @@ bool MaxPoolGradBackwardNoMask<T>::operator()( const Eigen::GpuDevice& d) { const int num_kernels = batch * channels * pooled_height * pooled_width; if (num_kernels == 0) return true; - CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); + GpuLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); if (data_format == FORMAT_NHWC) { TF_CHECK_OK( @@ -513,7 +513,7 @@ bool MaxPoolGradBackwardWithArgmax<T>::operator()( T* bottom_diff, const Eigen::GpuDevice& d, const bool include_batch_in_index) { if (input_size == 0) return true; - CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d); + GpuLaunchConfig config = GetCudaLaunchConfig(output_size, d); TF_CHECK_OK(CudaLaunchKernel( MaxPoolGradBackward<T>, config.block_count, config.thread_per_block, 0, d.stream(), output_size, top_diff, mask, top_offset, bottom_offset, diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h index c18c4891507..5383833b318 100644 --- a/tensorflow/core/kernels/maxpooling_op_gpu.h +++ b/tensorflow/core/kernels/maxpooling_op_gpu.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if !GOOGLE_CUDA -#error This file must only be included when building with Cuda support +#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM +#error This file must only be included when building with Cuda or ROCm support #endif #ifndef TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_ diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc index 752f6128526..f13cfc1782f 100644 --- a/tensorflow/core/kernels/mkl_avgpooling_op.cc +++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc @@ -16,16 +16,14 @@ #ifdef INTEL_MKL #define EIGEN_USE_THREADS +#include "mkldnn.hpp" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/mkl_pooling_ops_common.h" #include "tensorflow/core/util/mkl_util.h" -#include "tensorflow/core/kernels/mkl_pooling_ops_common.h" - -#ifndef INTEL_MKL_ML_ONLY -#include "mkldnn.hpp" using mkldnn::algorithm; using mkldnn::engine; using mkldnn::error; @@ -34,402 +32,11 @@ using mkldnn::padding_kind; using mkldnn::pooling_backward; using mkldnn::pooling_forward; using mkldnn::prop_kind; -#endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifdef INTEL_MKL_ML_ONLY - -template <typename Device, typename T> -class MklAvgPoolingOp : public OpKernel { - public: - explicit MklAvgPoolingOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format; - OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); - OP_REQUIRES(context, FormatFromString(data_format, &data_format_), - errors::InvalidArgument("Invalid data format")); - - OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); - OP_REQUIRES(context, ksize_.size() == 4, - errors::InvalidArgument("Sliding window ksize field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); - OP_REQUIRES(context, stride_.size() == 4, - errors::InvalidArgument("Sliding window stride field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); - OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, - errors::Unimplemented("Pooling is not yet supported on the " - "batch dimension.")); - } - - void Compute(OpKernelContext* context) override { - MklAvgPoolingOpContext mkl_context; - const Tensor& tensor_in = MklGetInput(context, 0); - GetMklShape(context, 0, &mkl_context.input_shape); - bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor(); - - if (!input_in_mkl_format) - mkl_context.params.in_dim = tensor_in.dims(); - else - mkl_context.params.in_dim = mkl_context.input_shape.GetDimension(); - - MklPoolParameters pool_params; - if (!input_in_mkl_format) { - pool_params.Init(context, ksize_, stride_, padding_, data_format_, - tensor_in.shape()); - } else { - pool_params.Init(context, ksize_, stride_, padding_, data_format_, - &mkl_context.input_shape); - } - - // Extract the parameters for the op from the pooling specs - ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params); - - Tensor mkl_tmp_input_buf_tensor_; - mkl_context.MklCreateLayoutsAndPrimitives(context, - &mkl_tmp_input_buf_tensor_); - OP_REQUIRES_OK(context, context->status()); - - Tensor workspace_tensor; - void* workspace_buf; - AllocTmpBuffer(context, &workspace_tensor, mkl_context.lt_workspace, - 
&workspace_buf); - - if (mkl_context.convert_input != nullptr) { - if (input_in_mkl_format == false) { - CHECK_EQ( - dnnConversionExecute_F32( - mkl_context.convert_input, - static_cast<void*>(const_cast<T*>(tensor_in.flat<T>().data())), - mkl_context.input_buf), - E_SUCCESS); - CHECK_EQ(dnnDelete_F32(mkl_context.convert_input), E_SUCCESS); - } else { - mkl_context.input_shape.GetConvertedFlatData( - mkl_context.lt_prim_input, - static_cast<void*>(const_cast<T*>(tensor_in.flat<T>().data())), - mkl_context.input_buf); - } - mkl_context.pooling_res[dnnResourceSrc] = mkl_context.input_buf; - } else { - mkl_context.pooling_res[dnnResourceSrc] = - static_cast<void*>(const_cast<T*>(tensor_in.flat<T>().data())); - } - - // Declare output tensor and allocate memory - Tensor* output = nullptr; - TensorShape tensor_out_shape; - MklShape mkl_out_shape; - mkl_out_shape.SetMklTensor(true); - mkl_out_shape.SetMklLayout(mkl_context.prim_pooling_fwd, dnnResourceDst); - mkl_out_shape.SetTfLayout(mkl_context.params.in_dim, - mkl_context.params.out_sizes, - mkl_context.params.out_strides); - mkl_out_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_); - - tensor_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>( - mkl_out_shape.GetMklLayout())) / - sizeof(T)); - - AllocateOutputSetMklShape(context, 0, &output, tensor_out_shape, - mkl_out_shape); - mkl_context.pooling_res[dnnResourceDst] = - static_cast<void*>(output->flat<T>().data()); - - mkl_context.pooling_res[dnnResourceWorkspace] = workspace_buf; - - CHECK_EQ( - dnnExecute_F32(mkl_context.prim_pooling_fwd, mkl_context.pooling_res), - E_SUCCESS); - - mkl_context.MklCleanup(); - } // Compute - - private: - typedef struct { - MklPoolingOpParams params; - MklShape input_shape; - dnnPrimitive_t prim_pooling_fwd = nullptr, convert_input = nullptr; - dnnLayout_t lt_user_input = nullptr, lt_prim_input = nullptr, - lt_workspace = nullptr; - void* input_buf = nullptr; - void* pooling_res[dnnResourceNumber]; - - void MklCreateLayoutsAndPrimitives(OpKernelContext* context, - Tensor* mkl_tmp_input_buf_tensor) { - bool input_in_mkl_format = input_shape.IsMklTensor(); - - if (!input_in_mkl_format) { - CHECK_EQ(dnnLayoutCreate_F32(<_user_input, params.in_dim, - params.in_sizes, params.in_strides), - E_SUCCESS); - } else { - lt_user_input = (dnnLayout_t)input_shape.GetCurLayout(); - } - - dnnAlgorithm_t algorithm = dnnAlgorithmPoolingAvg; - dnnPrimitiveAttributes_t primAttr = nullptr; - - // Create DNN primitives - CHECK_EQ(dnnPoolingCreateForward_F32( - &prim_pooling_fwd, primAttr, algorithm, lt_user_input, - params.kernel_size, params.kernel_stride, params.in_offset, - dnnBorderZerosAsymm), - E_SUCCESS); - - CHECK_EQ(dnnLayoutCreateFromPrimitive_F32( - <_prim_input, prim_pooling_fwd, dnnResourceSrc), - E_SUCCESS); - if (!dnnLayoutCompare_F32(lt_user_input, lt_prim_input)) { - CHECK_EQ(dnnConversionCreate_F32(&convert_input, lt_user_input, - lt_prim_input), - E_SUCCESS); - - AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_prim_input, - &input_buf); - } - - CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(<_workspace, prim_pooling_fwd, - dnnResourceWorkspace), - E_SUCCESS); - } - - void MklCleanup() { - bool input_in_mkl_format = input_shape.IsMklTensor(); - if (!input_in_mkl_format) { - CHECK_EQ(dnnLayoutDelete_F32(lt_user_input), E_SUCCESS); - } - - CHECK_EQ(dnnDelete_F32(prim_pooling_fwd), E_SUCCESS); - CHECK_EQ(dnnLayoutDelete_F32(lt_prim_input), E_SUCCESS); - } - } MklAvgPoolingOpContext; - - std::vector<int32> ksize_; - std::vector<int32> 
stride_; - Padding padding_; - TensorFormat data_format_; -}; - -//----------------------------------------------------------------------------- - -template <class Device, class T> -class MklAvgPoolingGradOp : public OpKernel { - public: - explicit MklAvgPoolingGradOp(OpKernelConstruction* context) - : OpKernel(context) { - string data_format; - - OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); - OP_REQUIRES(context, FormatFromString(data_format, &data_format_), - errors::InvalidArgument("Invalid data format")); - OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); - OP_REQUIRES(context, ksize_.size() == 4, - errors::InvalidArgument("Sliding window ksize field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); - OP_REQUIRES(context, stride_.size() == 4, - errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); - OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, - errors::Unimplemented("Pooling is not yet supported on the " - "batch dimension.")); - } - - void Compute(OpKernelContext* context) override { - MklAvgPoolingGradOpContext mkl_context; - const Tensor& tensor_in_shape = MklGetInput(context, 0); - const Tensor& out_backprop = MklGetInput(context, 1); - GetMklShape(context, 1, &mkl_context.out_backprop_shape); - bool outbackprop_in_mkl_format = - mkl_context.out_backprop_shape.IsMklTensor(); - - TensorShape output_shape; - auto shape_vec = tensor_in_shape.vec<int32>(); - for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) { - output_shape.AddDim(shape_vec(i)); - } - - MklPoolParameters pool_params; - pool_params.Init(context, ksize_, stride_, padding_, data_format_, - output_shape); - - if (outbackprop_in_mkl_format == false) - mkl_context.params.in_dim = out_backprop.dims(); - else - mkl_context.params.in_dim = mkl_context.out_backprop_shape.GetDimension(); - - // Extract the parameters for the op from the pooling specs - ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params); - - // Tensors needed to create temporary buffers - Tensor outbackprop_buf_tensor; - void* outbackprop_buf; - mkl_context.MklCreateLayoutsAndPrimitives(context); - OP_REQUIRES_OK(context, context->status()); - - // Check if outbackprop layout requires conversion. - if (!dnnLayoutCompare_F32(mkl_context.lt_user_outbackprop, - mkl_context.lt_prim_outbackprop)) { - CHECK_EQ(dnnConversionCreate_F32(&mkl_context.convert_outbackprop, - mkl_context.lt_user_outbackprop, - mkl_context.lt_prim_outbackprop), - E_SUCCESS); - - AllocTmpBuffer(context, &outbackprop_buf_tensor, - mkl_context.lt_prim_outbackprop, &outbackprop_buf); - - if (!outbackprop_in_mkl_format) { - CHECK_EQ(dnnConversionExecute_F32(mkl_context.convert_outbackprop, - static_cast<void*>(const_cast<T*>( - out_backprop.flat<T>().data())), - outbackprop_buf), - E_SUCCESS); - CHECK_EQ(dnnDelete_F32(mkl_context.convert_outbackprop), E_SUCCESS); - } else { - mkl_context.out_backprop_shape.GetConvertedFlatData( - mkl_context.lt_prim_outbackprop, - static_cast<void*>(const_cast<T*>(out_backprop.flat<T>().data())), - outbackprop_buf); - } - mkl_context.pooling_res[dnnResourceDiffDst] = outbackprop_buf; - } else { - mkl_context.pooling_res[dnnResourceDiffDst] = - static_cast<void*>(const_cast<T*>(out_backprop.flat<T>().data())); - } - - // Handle workspace requirements. 
- Tensor workspace_buf_tensor; - void* workspace_buf; - AllocTmpBuffer(context, &workspace_buf_tensor, mkl_context.lt_workspace, - &workspace_buf); - mkl_context.pooling_res[dnnResourceWorkspace] = workspace_buf; - - // Handle MKL output tensor setup. - Tensor* output = nullptr; - TensorShape tensor_out_shape; - MklShape mkl_out_shape; - mkl_out_shape.SetMklTensor(true); - mkl_out_shape.SetMklLayout(mkl_context.prim_pooling_bwd, - dnnResourceDiffSrc); - mkl_out_shape.SetTfLayout(mkl_context.params.in_dim, - mkl_context.params.in_sizes, - mkl_context.params.in_strides); - mkl_out_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_); - - tensor_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>( - mkl_out_shape.GetMklLayout())) / - sizeof(T)); - - AllocateOutputSetMklShape(context, 0, &output, tensor_out_shape, - mkl_out_shape); - - // Set output tensor. - mkl_context.pooling_res[dnnResourceDiffSrc] = - static_cast<void*>(output->flat<T>().data()); - - // Execute primitive. - CHECK_EQ( - dnnExecute_F32(mkl_context.prim_pooling_bwd, mkl_context.pooling_res), - E_SUCCESS); - - mkl_context.MklCleanup(); - } - - private: - typedef struct { - MklPoolingOpParams params; - MklShape out_backprop_shape; - dnnPrimitive_t prim_pooling_bwd = nullptr, convert_outbackprop = nullptr; - void* pooling_res[dnnResourceNumber]; - dnnLayout_t lt_user_input = nullptr, lt_user_outbackprop = nullptr, - lt_prim_outbackprop = nullptr, lt_workspace = nullptr; - - void MklCreateLayoutsAndPrimitives(OpKernelContext* context) { - const Tensor& tensor_in_shape = MklGetInput(context, 0); - const Tensor& out_backprop = MklGetInput(context, 1); - bool outbackprop_in_mkl_format = out_backprop_shape.IsMklTensor(); - - if (!outbackprop_in_mkl_format) { - // For avgpooling, tensor_in_shape should have 1 dimension, and 4 - // elements. - OP_REQUIRES( - context, - tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4, - errors::InvalidArgument("original input shape must be " - "1-dimensional and 4 elements")); - - // For avgpooling, out_backprop should have 4 dimensions. - OP_REQUIRES( - context, out_backprop.dims() == 4, - errors::InvalidArgument("out_backprop must be 4-dimensional")); - } else { - // Input in MKL format. - // For avgpooling, out_backprop should have 4 dimensions. - OP_REQUIRES( - context, out_backprop_shape.GetDimension() == 4, - errors::InvalidArgument("out_backprop must be 4-dimensional")); - } - - // TODO(inteltf): Get outbackprop layout. - // Do we need to create layout in every invocation? - if (!outbackprop_in_mkl_format) { - CHECK_EQ(dnnLayoutCreate_F32(<_user_outbackprop, params.in_dim, - params.out_sizes, params.out_strides), - E_SUCCESS); - } else { - lt_user_outbackprop = (dnnLayout_t)out_backprop_shape.GetCurLayout(); - } - - // Create the backward primitive - // Create DNN user layout - CHECK_EQ(dnnLayoutCreate_F32(<_user_input, params.in_dim, - params.in_sizes, params.in_strides), - E_SUCCESS); - - // Create PoolingBackward primitive - dnnAlgorithm_t algorithm = dnnAlgorithmPoolingAvg; - dnnPrimitiveAttributes_t primAttr = nullptr; - CHECK_EQ(dnnPoolingCreateBackward_F32( - &prim_pooling_bwd, primAttr, algorithm, lt_user_input, - params.kernel_size, params.kernel_stride, params.in_offset, - dnnBorderZerosAsymm), - E_SUCCESS); - - // Create expected outbackprop layout from the primitive. 
- CHECK_EQ(dnnLayoutCreateFromPrimitive_F32( - <_prim_outbackprop, prim_pooling_bwd, dnnResourceDiffDst), - E_SUCCESS); - - CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(<_workspace, prim_pooling_bwd, - dnnResourceWorkspace), - E_SUCCESS); - } - - void MklCleanup() { - bool outbackprop_in_mkl_format = out_backprop_shape.IsMklTensor(); - CHECK_EQ(dnnDelete_F32(prim_pooling_bwd), E_SUCCESS); - CHECK_EQ(dnnLayoutDelete_F32(lt_user_input), E_SUCCESS); - if (!outbackprop_in_mkl_format) { - CHECK_EQ(dnnLayoutDelete_F32(lt_user_outbackprop), E_SUCCESS); - } - CHECK_EQ(dnnLayoutDelete_F32(lt_prim_outbackprop), E_SUCCESS); - CHECK_EQ(dnnLayoutDelete_F32(lt_workspace), E_SUCCESS); - } - } MklAvgPoolingGradOpContext; - - std::vector<int32> ksize_; - std::vector<int32> stride_; - Padding padding_; - TensorFormat data_format_; -}; // MklAvgPoolingGradOp - -#else - template <typename Device, typename T> class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> { public: @@ -716,8 +323,6 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> { TF_CALL_float(REGISTER_MKL_AVGPOOL3D_KERNELS); TF_CALL_bfloat16(REGISTER_MKL_AVGPOOL3D_KERNELS); -#endif // INTEL_MKL_ML_ONLY - #define REGISTER_MKL_AVGPOOL_KERNELS(T) \ REGISTER_KERNEL_BUILDER(Name("_MklAvgPool") \ .Device(DEVICE_CPU) \ diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index fcc7248340b..e406081d481 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -16,11 +16,15 @@ limitations under the License. // See docs in ../ops/nn_ops.cc. #ifdef INTEL_MKL +#include "tensorflow/core/kernels/mkl_conv_ops.h" + #include <string.h> + #include <algorithm> #include <map> #include <vector> +#include "mkldnn.hpp" #include "absl/strings/str_join.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/numeric_op.h" @@ -29,7 +33,6 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" -#include "tensorflow/core/kernels/mkl_conv_ops.h" #include "tensorflow/core/kernels/mkl_quantized_conv_ops.h" #include "tensorflow/core/kernels/no_op.h" #include "tensorflow/core/kernels/ops_util.h" @@ -40,28 +43,17 @@ limitations under the License. #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" -#include "tensorflow/core/util/mkl_util.h" - -#ifndef INTEL_MKL_ML_ONLY -#include "mkldnn.hpp" - using mkldnn::prop_kind; using mkldnn::stream; using mkldnn::convolution_forward; using mkldnn::convolution_direct; -#else -#include "mkl_dnn.h" -#include "mkl_dnn_types.h" -#endif - namespace tensorflow { -#ifndef INTEL_MKL_ML_ONLY - // This structure aggregates multiple inputs to Conv2DFwd* methods. struct MklConvFwdParams { memory::dims src_dims; @@ -403,449 +395,8 @@ class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory<float> { } }; -#endif - typedef Eigen::ThreadPoolDevice CPUDevice; -// For now, MKL-ML is default. So making MKL-DNN not a default choice. 
-#ifdef INTEL_MKL_ML_ONLY -template <typename Device, typename T, bool bias_enabled> -class MklConvOp : public OpKernel { - public: - ~MklConvOp() {} - - explicit MklConvOp(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); - string data_format; - OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); - OP_REQUIRES(context, FormatFromString(data_format, &data_format_), - errors::InvalidArgument("Invalid data format")); - OP_REQUIRES(context, strides_.size() == 4, - errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); - - const int64 stride_n = GetTensorDim(strides_, data_format_, 'N'); - const int64 stride_c = GetTensorDim(strides_, data_format_, 'C'); - OP_REQUIRES( - context, stride_n == 1 && stride_c == 1, - errors::InvalidArgument("Current implementation does not yet support " - "strides in the batch and depth dimensions.")); - OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); - } - - void Compute(OpKernelContext* context) override { - MklConv2DOpContext mkl_context; - const Tensor& input = MklGetInput(context, 0); - GetMklShape(context, 0, &(mkl_context.input_shape)); - bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor(); - - const Tensor& filter = MklGetInput(context, 1); - MklShape mkl_filter_shape; - GetMklShape(context, 1, &mkl_filter_shape); - CHECK(!mkl_filter_shape.IsMklTensor()) - << "Conv filter should not be in MKL Layout"; - - if (bias_enabled) { - const Tensor& bias = MklGetInput(context, 2); - OP_REQUIRES(context, bias.dims() == 1, - errors::InvalidArgument("bias must be 1-dimensional: ", - bias.shape().DebugString())); - } - - if (!input_in_mkl_format) { - OP_REQUIRES(context, input.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional", - input.shape().DebugString())); - } - - OP_REQUIRES(context, filter.dims() == 4, - errors::InvalidArgument("filter must be 4-dimensional: ", - filter.shape().DebugString())); - - for (int i = 0; i < 3; ++i) { - OP_REQUIRES( - context, - FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()), - errors::InvalidArgument("filter dimension is too large")); - } - - const int64 input_depth = - input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'C') - : GetTensorDim(input, data_format_, 'C'); - OP_REQUIRES(context, input_depth == filter.dim_size(2), - errors::InvalidArgument( - "input and filter must have the same depth: ", input_depth, - " vs ", filter.dim_size(2))); - // The last dimension for filter is out_depth. - const int out_depth = static_cast<int>(filter.dim_size(3)); - - // The second dimension for input is rows/height. - // The first dimension for filter is rows/height. - const int64 input_rows_raw = - input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'H') - : GetTensorDim(input, data_format_, 'H'); - OP_REQUIRES( - context, - FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()), - errors::InvalidArgument("Input rows are too large")); - const int input_rows = static_cast<int>(input_rows_raw); - const int filter_rows = static_cast<int>(filter.dim_size(0)); - - // The third dimension for input is columns/width. - // The second dimension for filter is columns/width. - const int64 input_cols_raw = - input_in_mkl_format ? 
GetMklTensorDim(mkl_context.input_shape, 'W') - : GetTensorDim(input, data_format_, 'W'); - OP_REQUIRES( - context, - FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()), - errors::InvalidArgument("Input cols are too large")); - const int input_cols = static_cast<int>(input_cols_raw); - const int filter_cols = static_cast<int>(filter.dim_size(1)); - - // The first dimension for input is batch. - const int64 input_batch_raw = - input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'N') - : GetTensorDim(input, data_format_, 'N'); - OP_REQUIRES( - context, - FastBoundsCheck(input_batch_raw, std::numeric_limits<int>::max()), - errors::InvalidArgument("batch is too large")); - const int batch = static_cast<int>(input_batch_raw); - - // For now we take the stride from the second and third dimensions only (we - // do not support striding on the batch or depth dimension). - const int stride_rows = GetTensorDim(strides_, data_format_, 'H'); - const int stride_cols = GetTensorDim(strides_, data_format_, 'W'); - - int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; - OP_REQUIRES_OK(context, - GetWindowedOutputSize(input_rows, filter_rows, stride_rows, - padding_, &out_rows, &pad_rows)); - OP_REQUIRES_OK(context, - GetWindowedOutputSize(input_cols, filter_cols, stride_cols, - padding_, &out_cols, &pad_cols)); - TensorShape out_shape = - ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth); - - // Output tensor is of the following dimensions: - // [ in_batch, out_rows, out_cols, out_depth ] - Tensor* output = nullptr; - - // If there is nothing to compute, return. - if (out_shape.num_elements() == 0) { - // Nothing to do, allocate output tensor and return - MklShape mkl_output_mkl_shape; - mkl_output_mkl_shape.SetMklTensor(false); - AllocateOutputSetMklShape(context, 0, &output, input.shape(), - mkl_output_mkl_shape); - return; - } - - if (batch == 0) { - // Nothing to do, allocate output tensor and return - MklShape mkl_output_mkl_shape; - mkl_output_mkl_shape.SetMklTensor(false); - AllocateOutputSetMklShape(context, 0, &output, input.shape(), - mkl_output_mkl_shape); - return; - } - - // Create MKL convolution primitives - mkl_context.in_dims = input_in_mkl_format - ? 
mkl_context.input_shape.GetDimension() - : input.dims(); - mkl_context.filter_dims = filter.dims(); - - mkl_context.in_sizes[MklDims::W] = static_cast<size_t>(input_cols); - mkl_context.in_sizes[MklDims::H] = static_cast<size_t>(input_rows); - mkl_context.in_sizes[MklDims::C] = static_cast<size_t>(input_depth); - mkl_context.in_sizes[MklDims::N] = static_cast<size_t>(batch); - - mkl_context.out_sizes[MklDims::W] = static_cast<size_t>(out_cols); - mkl_context.out_sizes[MklDims::H] = static_cast<size_t>(out_rows); - mkl_context.out_sizes[MklDims::C] = static_cast<size_t>(out_depth); - mkl_context.out_sizes[MklDims::N] = static_cast<size_t>(batch); - - mkl_context.input_offset[0] = static_cast<int>(-pad_cols); - mkl_context.input_offset[1] = static_cast<int>(-pad_rows); - - mkl_context.conv_stride[0] = static_cast<size_t>(stride_cols); - mkl_context.conv_stride[1] = static_cast<size_t>(stride_rows); - - GetStridesFromSizes(data_format_, mkl_context.out_strides, - mkl_context.out_sizes); - GetStridesFromSizes(data_format_, mkl_context.in_strides, - mkl_context.in_sizes); - - // TF filter dimension order (out_depth, in_depth, cols, rows) -> - // MKL filter dimension order (out_depth, in_depth, rows, cols) - mkl_context.filter_sizes[0] = filter.dim_size(1); // cols - mkl_context.filter_sizes[1] = filter.dim_size(0); // rows - mkl_context.filter_sizes[2] = filter.dim_size(2); // in_depth - mkl_context.filter_sizes[3] = filter.dim_size(3); // out_depth - - // TF filter layout - (rows, cols, in_depth, out_depth) - mkl_context.filter_strides[0] = - filter.dim_size(2) * filter.dim_size(3); // cols - mkl_context.filter_strides[1] = - filter.dim_size(1) * filter.dim_size(2) * filter.dim_size(3); // rows - mkl_context.filter_strides[2] = filter.dim_size(3); // in_depth - mkl_context.filter_strides[3] = 1; // out_depth - - if (bias_enabled) { - const Tensor& bias = MklGetInput(context, 2); - mkl_context.bias_sizes[0] = {static_cast<size_t>(bias.dim_size(0))}; - mkl_context.bias_strides[0] = {1}; - } - - // Create Convolution Primitive - if (bias_enabled) { - CHECK_EQ( - dnnConvolutionCreateForwardBias_F32( - &mkl_context.prim_fwd, nullptr, dnnAlgorithmConvolutionDirect, - mkl_context.in_dims, mkl_context.in_sizes, mkl_context.out_sizes, - mkl_context.filter_sizes, mkl_context.conv_stride, - mkl_context.input_offset, dnnBorderZeros), - E_SUCCESS); - } else { - CHECK_EQ( - dnnConvolutionCreateForward_F32( - &mkl_context.prim_fwd, nullptr, dnnAlgorithmConvolutionDirect, - mkl_context.in_dims, mkl_context.in_sizes, mkl_context.out_sizes, - mkl_context.filter_sizes, mkl_context.conv_stride, - mkl_context.input_offset, dnnBorderZeros), - E_SUCCESS); - } - - TensorShape mkl_output_tf_shape; - MklShape mkl_output_mkl_shape; - mkl_output_mkl_shape.SetMklTensor(true); - mkl_output_mkl_shape.SetMklLayout(mkl_context.prim_fwd, dnnResourceDst); - mkl_output_mkl_shape.SetTfLayout(mkl_context.in_dims, mkl_context.out_sizes, - mkl_context.out_strides); - // MKL might change the dimension ordering - // Create mapping to recover the original TF dimension order - mkl_output_mkl_shape.SetTfDimOrder(mkl_context.in_dims, data_format_); - - mkl_output_tf_shape.AddDim( - dnnLayoutGetMemorySize_F32( - static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) / - sizeof(T)); - AllocateOutputSetMklShape(context, 0, &output, mkl_output_tf_shape, - mkl_output_mkl_shape); - // Filter output to be used in the backprop_input - TensorShape mkl_filter_output_tf_shape; - MklShape mkl_filter_output_mkl_shape; - 
mkl_filter_output_mkl_shape.SetMklTensor(true); - mkl_filter_output_mkl_shape.SetMklLayout(mkl_context.prim_fwd, - dnnResourceFilter); - - size_t filter_sizes[4] = {static_cast<size_t>(filter.dim_size(0)), - static_cast<size_t>(filter.dim_size(1)), - static_cast<size_t>(filter.dim_size(2)), - static_cast<size_t>(filter.dim_size(3))}; - mkl_filter_output_mkl_shape.SetTfLayout(filter.dims(), filter_sizes, - mkl_context.filter_strides); - - mkl_filter_output_mkl_shape.SetTfDimOrder(mkl_context.filter_dims, - data_format_); - mkl_filter_output_tf_shape.AddDim( - dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>( - mkl_filter_output_mkl_shape.GetMklLayout())) / - sizeof(T)); - AllocateOutputSetMklShape(context, 1, &mkl_context.output_filter, - mkl_filter_output_tf_shape, - mkl_filter_output_mkl_shape); - - mkl_context.conv_res[dnnResourceDst] = - static_cast<void*>(output->flat<T>().data()); - - mkl_context.MklCreateInputLayouts(context); - - // Temp tensor used to allocate tmp buffers - Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor, - mkl_tmp_bias_buf_tensor; - mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor, - &mkl_tmp_filter_buf_tensor, - &mkl_tmp_bias_buf_tensor); - - // Execute convolution - CHECK_EQ(dnnExecute_F32(mkl_context.prim_fwd, mkl_context.conv_res), - E_SUCCESS); - - mkl_context.MklCleanup(); - } - - private: - typedef struct { - int in_dims; - size_t in_sizes[4]; - size_t in_strides[4]; - size_t out_sizes[4]; - size_t out_strides[4]; - int filter_dims; - size_t filter_sizes[4]; - size_t filter_strides[4]; - size_t bias_sizes[1]; - size_t bias_strides[1]; - int input_offset[2]; - size_t conv_stride[2]; - MklShape input_shape; - dnnPrimitive_t prim_fwd; - void* conv_res[dnnResourceNumber]; - dnnLayout_t lt_filter, lt_bias, lt_input; - Tensor* output_filter = nullptr; - - // Create MKL dnnLayout_t objects for tensors coming into the layer - void MklCreateInputLayouts(OpKernelContext* context) { - bool input_in_mkl_format = input_shape.IsMklTensor(); - if (input_in_mkl_format) { - lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout()); - } else { - CHECK_EQ(dnnLayoutCreate_F32(<_input, in_dims, in_sizes, in_strides), - E_SUCCESS); - } - - CHECK_EQ(dnnLayoutCreate_F32(<_filter, filter_dims, filter_sizes, - filter_strides), - E_SUCCESS); - - if (bias_enabled) { - CHECK_EQ(dnnLayoutCreate_F32(<_bias, 1, bias_sizes, bias_strides), - E_SUCCESS); - } - } - - // Compare incoming tensor layouts with MKL preferred layouts and convert - // data to the preferred layout if necessary - void MklPrepareConvolutionInputs(OpKernelContext* context, - Tensor* mkl_tmp_input_buf_tensor, - Tensor* mkl_tmp_filter_buf_tensor, - Tensor* mkl_tmp_bias_buf_tensor) { - bool mkl_convert_input, mkl_convert_filter, mkl_convert_bias; - dnnPrimitive_t mkl_prim_convert_filter, mkl_prim_convert_bias, - mkl_prim_convert_input; - dnnLayout_t mkl_lt_internal_filter, mkl_lt_internal_bias, - mkl_lt_internal_input; - void *mkl_buf_convert_input, *mkl_buf_convert_filter, - *mkl_buf_convert_bias; - mkl_prim_convert_filter = nullptr; - mkl_prim_convert_bias = nullptr; - mkl_prim_convert_input = nullptr; - mkl_lt_internal_filter = nullptr; - mkl_lt_internal_bias = nullptr; - mkl_lt_internal_input = nullptr; - mkl_buf_convert_input = nullptr; - mkl_buf_convert_filter = nullptr; - mkl_buf_convert_bias = nullptr; - - // Compare with internal layouts and convert if needed - const Tensor& input = MklGetInput(context, 0); - void* mkl_buf_input = - const_cast<void*>(static_cast<const 
void*>(input.flat<T>().data())); - CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_input, - prim_fwd, dnnResourceSrc), - E_SUCCESS); - mkl_convert_input = - !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input); - if (mkl_convert_input) { - CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input, - mkl_lt_internal_input), - E_SUCCESS); - AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input, - &mkl_buf_convert_input); - CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input, - mkl_buf_convert_input), - E_SUCCESS); - dnnDelete_F32(mkl_prim_convert_input); - } - dnnLayoutDelete_F32(mkl_lt_internal_input); - - conv_res[dnnResourceSrc] = - (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input; - - const Tensor& filter = MklGetInput(context, 1); - void* mkl_buf_filter = - const_cast<void*>(static_cast<const void*>(filter.flat<T>().data())); - CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_filter, - prim_fwd, dnnResourceFilter), - E_SUCCESS); - mkl_convert_filter = - !dnnLayoutCompare_F32(mkl_lt_internal_filter, lt_filter); - if (mkl_convert_filter) { - CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_filter, lt_filter, - mkl_lt_internal_filter), - E_SUCCESS); - - mkl_buf_convert_filter = const_cast<void*>( - static_cast<const void*>(output_filter->flat<T>().data())); - - CHECK_EQ( - dnnConversionExecute_F32(mkl_prim_convert_filter, mkl_buf_filter, - mkl_buf_convert_filter), - E_SUCCESS); - dnnDelete_F32(mkl_prim_convert_filter); - } - dnnLayoutDelete_F32(mkl_lt_internal_filter); - - conv_res[dnnResourceFilter] = - (mkl_convert_filter) ? mkl_buf_convert_filter : mkl_buf_filter; - - if (bias_enabled) { - const Tensor& bias = MklGetInput(context, 2); - void* mkl_buf_bias = - const_cast<void*>(static_cast<const void*>(bias.flat<T>().data())); - CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_bias, - prim_fwd, dnnResourceBias), - E_SUCCESS); - mkl_convert_bias = !dnnLayoutCompare_F32(mkl_lt_internal_bias, lt_bias); - if (mkl_convert_bias) { - CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_bias, lt_bias, - mkl_lt_internal_bias), - E_SUCCESS); - AllocTmpBuffer(context, mkl_tmp_bias_buf_tensor, mkl_lt_internal_bias, - &mkl_buf_convert_bias); - CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_bias, mkl_buf_bias, - mkl_buf_convert_bias), - E_SUCCESS); - dnnDelete_F32(mkl_prim_convert_bias); - } - dnnLayoutDelete_F32(mkl_lt_internal_bias); - - conv_res[dnnResourceBias] = - (mkl_convert_bias) ? 
mkl_buf_convert_bias : mkl_buf_bias; - } - } - - void MklCleanup() { - bool input_in_mkl_format = input_shape.IsMklTensor(); - dnnDelete_F32(prim_fwd); - if (!input_in_mkl_format) dnnLayoutDelete_F32(lt_input); - dnnLayoutDelete_F32(lt_filter); - if (bias_enabled) dnnLayoutDelete_F32(lt_bias); - } - } MklConv2DOpContext; - - std::vector<int32> strides_; - Padding padding_; - TensorFormat data_format_; -}; - -// FP32 kernel registration for INTEL_MKL_ML -REGISTER_KERNEL_BUILDER(Name("_MklConv2D") - .Device(DEVICE_CPU) - .TypeConstraint<float>("T") - .Label(mkl_op_registry::kMklOpLabel), - MklConv2DOp<CPUDevice, float, false>); -REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias") - .Device(DEVICE_CPU) - .TypeConstraint<float>("T") - .Label(mkl_op_registry::kMklOpLabel), - MklConv2DOp<CPUDevice, float, true>); - -#else - // Base class for convolution forward operations template <typename Device, typename Tinput, typename Tfilter, typename Tbias, typename Toutput, typename Ttemp_output, typename Tpadding, @@ -2218,8 +1769,6 @@ REGISTER_KERNEL_BUILDER( .Label(mkl_op_registry::kMklQuantizedOpLabel), MklQuantizedConv2DReluOp<CPUDevice, qint32, quint8, quint8, true, true>); -#endif // INTEL_MKL_ML - // Register 2D operations #define REGISTER_MKL_CPU_2D(T) \ REGISTER_KERNEL_BUILDER( \ diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc index 8279691abed..0e30eb53550 100644 --- a/tensorflow/core/kernels/mkl_maxpooling_op.cc +++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc @@ -16,6 +16,10 @@ limitations under the License. // See docs in ../ops/nn_ops.cc. #ifdef INTEL_MKL #define EIGEN_USE_THREADS + +#include <algorithm> + +#include "mkldnn.hpp" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/mkl_pooling_ops_common.h" @@ -23,9 +27,6 @@ limitations under the License. #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/padding.h" -#ifndef INTEL_MKL_ML_ONLY -#include <algorithm> -#include "mkldnn.hpp" using mkldnn::algorithm; using mkldnn::engine; using mkldnn::error; @@ -34,471 +35,11 @@ using mkldnn::padding_kind; using mkldnn::pooling_backward; using mkldnn::pooling_forward; using mkldnn::prop_kind; -#endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -// MKL-DNN is now default. MKL-ML must be specified explicitly. -#ifdef INTEL_MKL_ML_ONLY - -// An implementation of MaxPooling (forward). 
-template <typename Device, typename T> -class MklMaxPoolingOp : public OpKernel { - public: - explicit MklMaxPoolingOp(OpKernelConstruction* context) : OpKernel(context) { - string data_format; - - OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); - OP_REQUIRES(context, FormatFromString(data_format, &data_format_), - errors::InvalidArgument("Invalid data format")); - OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); - OP_REQUIRES(context, ksize_.size() == 4, - errors::InvalidArgument("Sliding window ksize field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); - OP_REQUIRES(context, stride_.size() == 4, - errors::InvalidArgument("Sliding window stride field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); - OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, - errors::Unimplemented("Pooling is not yet supported on the " - "batch dimension.")); - - workspace_enabled_ = false; - // We may not get this attribute for this node if it does not go through - // graph rewrite pass. So we do not check for error while retrieving this - // attribute value. - OP_REQUIRES_OK(context, - context->GetAttr("workspace_enabled", &workspace_enabled_)); - } - - void Compute(OpKernelContext* context) override { - MklMaxPoolingOpContext mkl_context; - // Get the input tensor - const Tensor& tensor_in = MklGetInput(context, 0); - GetMklShape(context, 0, &mkl_context.input_shape); - bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor(); - - mkl_context.params.in_dim = 4; - MklPoolParameters pool_params; - if (input_in_mkl_format == false) { - pool_params.Init(context, ksize_, stride_, padding_, data_format_, - tensor_in.shape()); - OP_REQUIRES( - context, (pool_params.depth_window == 1), - errors::Unimplemented("Depthwise max pooling not supported by MKL")); - - } else { - pool_params.Init(context, ksize_, stride_, padding_, data_format_, - &mkl_context.input_shape); - } - - // Extract the parameters for the op from the pooling specs - - ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params); - - mkl_context.MklCreateLayoutsAndPrimitives(context); - OP_REQUIRES_OK(context, context->status()); - - // Declare output tensor - TensorShape tensor_out_shape; - MklShape mkl_out_shape, mkl_workspace_shape; - mkl_out_shape.SetMklTensor(true); - mkl_out_shape.SetMklLayout(mkl_context.prim_pooling_fwd, dnnResourceDst); - mkl_out_shape.SetTfLayout(mkl_context.params.in_dim, - mkl_context.params.out_sizes, - mkl_context.params.out_strides); - mkl_out_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_); - - Tensor* output_tensor = nullptr; - tensor_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>( - mkl_out_shape.GetMklLayout())) / - sizeof(T)); - AllocateOutputSetMklShape(context, 0, &output_tensor, tensor_out_shape, - mkl_out_shape); - - Tensor* workspace_tensor; - void* workspace_buf = nullptr; - - TensorShape workspace_shape; - mkl_workspace_shape.SetMklTensor(false); - workspace_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>( - mkl_context.lt_workspace)) / - sizeof(T)); - AllocateOutputSetMklShape(context, 1, &workspace_tensor, workspace_shape, - mkl_workspace_shape); - - mkl_context.pooling_res[dnnResourceWorkspace] = const_cast<void*>( - static_cast<const void*>(workspace_tensor->flat<T>().data())); - mkl_context.pooling_res[dnnResourceSrc] = - const_cast<void*>(static_cast<const 
void*>(tensor_in.flat<T>().data())); - mkl_context.pooling_res[dnnResourceDst] = const_cast<void*>( - static_cast<const void*>(output_tensor->flat<T>().data())); - - CHECK_EQ( - dnnExecute_F32(mkl_context.prim_pooling_fwd, mkl_context.pooling_res), - E_SUCCESS); - - mkl_context.MklCleanup(); - } - - private: - typedef struct { - MklPoolingOpParams params; - MklShape input_shape; - void* pooling_res[dnnResourceNumber]; - dnnPrimitive_t prim_pooling_fwd = nullptr; - dnnLayout_t lt_user_input = nullptr, lt_workspace = nullptr; - - void MklCreateLayoutsAndPrimitives(OpKernelContext* context) { - bool input_in_mkl_format = input_shape.IsMklTensor(); - // Create or use existing DNN user layout - if (input_in_mkl_format == false) { - CHECK_EQ(dnnLayoutCreate_F32(<_user_input, params.in_dim, - params.in_sizes, params.in_strides), - E_SUCCESS); - } else { - lt_user_input = (dnnLayout_t)input_shape.GetCurLayout(); - } - - dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax; - dnnPrimitiveAttributes_t primAttr = nullptr; - - // Create DNN primitives - CHECK_EQ(dnnPoolingCreateForward_F32( - &prim_pooling_fwd, primAttr, algorithm, lt_user_input, - params.kernel_size, params.kernel_stride, params.in_offset, - dnnBorderZerosAsymm), - E_SUCCESS); - - // Creates layout for the workspace - CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(<_workspace, prim_pooling_fwd, - dnnResourceWorkspace), - E_SUCCESS); - } - - void MklCleanup() { - bool input_in_mkl_format = input_shape.IsMklTensor(); - CHECK_EQ(dnnDelete_F32(prim_pooling_fwd), E_SUCCESS); - if (!input_in_mkl_format) { - CHECK_EQ(dnnLayoutDelete_F32(lt_user_input), E_SUCCESS); - } - CHECK_EQ(dnnLayoutDelete_F32(lt_workspace), E_SUCCESS); - } - } MklMaxPoolingOpContext; - - std::vector<int32> ksize_; - std::vector<int32> stride_; - Padding padding_; - TensorFormat data_format_; - bool workspace_enabled_; -}; - -// The operation to compute MaxPool gradients. -// It takes three inputs: -// - The original input tensor -// - The original output tensor -// - Backprop tensor for output -// It produces one output: backprop tensor for input. -template <class Device, class T> -class MklMaxPoolingGradOp : public OpKernel { - public: - explicit MklMaxPoolingGradOp(OpKernelConstruction* context) - : OpKernel(context) { - string data_format; - - OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); - OP_REQUIRES(context, FormatFromString(data_format, &data_format_), - errors::InvalidArgument("Invalid data format")); - OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); - OP_REQUIRES(context, ksize_.size() == 4, - errors::InvalidArgument("Sliding window ksize field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); - OP_REQUIRES(context, stride_.size() == 4, - errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); - OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, - errors::Unimplemented( - "Pooling is not yet supported on the batch dimension.")); - workspace_enabled_ = false; - // We may not get this attribute for this node if it does not go through - // graph rewrite pass. So we do not check for error while retrieving this - // attribute value. 
- OP_REQUIRES_OK(context, - context->GetAttr("workspace_enabled", &workspace_enabled_)); - } - - void Compute(OpKernelContext* context) override { - MklMaxPoolingGradOpContext mkl_context; - // Input - The original input tensor - const Tensor& tensor_in = MklGetInput(context, 0); - - // Output - Backprop tensor for input. - Tensor* output_tensor = nullptr; - - GetMklShape(context, 0, &mkl_context.input_shape); - GetMklShape(context, 2, &mkl_context.output_backprop_shape); - bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor(); - - if (input_in_mkl_format == false) - mkl_context.params.in_dim = tensor_in.dims(); - else - mkl_context.params.in_dim = mkl_context.input_shape.GetDimension(); - - MklPoolParameters pool_params; - if (input_in_mkl_format == false) { - pool_params.Init(context, ksize_, stride_, padding_, data_format_, - tensor_in.shape()); - OP_REQUIRES( - context, (pool_params.depth_window == 1), - errors::Unimplemented("Depthwise max pooling not supported by MKL")); - - } else { - pool_params.Init(context, ksize_, stride_, padding_, data_format_, - &mkl_context.input_shape); - } - - // Extract the parameters for the op from the pooling specs - ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params); - - mkl_context.MklCreateLayouts(context); - OP_REQUIRES_OK(context, context->status()); - - mkl_context.MklCreatePrimitives(context, workspace_enabled_); - OP_REQUIRES_OK(context, context->status()); - - mkl_context.MklPrepareInputs(context, workspace_enabled_); - OP_REQUIRES_OK(context, context->status()); - - // Create shape for the input back prop output - TensorShape mkl_input_backprop; - MklShape mkl_output_shape; - mkl_output_shape.SetMklTensor(true); - mkl_output_shape.SetMklLayout(mkl_context.prim_pooling_bwd, - dnnResourceDiffSrc); - mkl_output_shape.SetTfLayout(mkl_context.params.in_dim, - mkl_context.params.in_sizes, - mkl_context.params.in_strides); - mkl_output_shape.SetTfDimOrder(mkl_context.params.in_dim, data_format_); - - mkl_input_backprop.AddDim( - dnnLayoutGetMemorySize_F32( - static_cast<dnnLayout_t>(mkl_output_shape.GetMklLayout())) / - sizeof(T)); - AllocateOutputSetMklShape(context, 0, &output_tensor, mkl_input_backprop, - mkl_output_shape); - mkl_context.pooling_res[dnnResourceDiffSrc] = const_cast<void*>( - static_cast<const void*>(output_tensor->flat<T>().data())); - - CHECK_EQ( - dnnExecute_F32(mkl_context.prim_pooling_bwd, mkl_context.pooling_res), - E_SUCCESS); - - mkl_context.MklCleanup(workspace_enabled_); - } - - private: - typedef struct { - MklPoolingOpParams params; - MklShape input_shape, output_backprop_shape; - void* pooling_resfwd[dnnResourceNumber]; - void* pooling_res[dnnResourceNumber]; - dnnPrimitive_t prim_pooling_fwd = nullptr, prim_pooling_bwd = nullptr, - convert_input = nullptr, convert_outbackprop = nullptr; - dnnLayout_t lt_outbackprop_user = nullptr, lt_outbackprop_prim = nullptr, - lt_input_user = nullptr, lt_input_prim = nullptr; - void* input_buf; - void* outbackprop_buf; - Tensor tmp_output_buf_tensor; - Tensor workspace_buf_tensor; - Tensor input_buf_tensor, outbackprop_buf_tensor; - - void MklCreateLayouts(OpKernelContext* context) { - bool input_in_mkl_format = input_shape.IsMklTensor(); - bool outbackprop_in_mkl_format = output_backprop_shape.IsMklTensor(); - // Create DNN user layout for input and outbackprop or get existing layout - if (input_in_mkl_format == false) { - CHECK_EQ(dnnLayoutCreate_F32(<_input_user, params.in_dim, - params.in_sizes, params.in_strides), - E_SUCCESS); - } else { - 
lt_input_user = (dnnLayout_t)input_shape.GetCurLayout(); - } - - // We don't care about the output layout for now as we can create it from - // primitives for the max pooling fwd prop - if (outbackprop_in_mkl_format == false) { - CHECK_EQ(dnnLayoutCreate_F32(<_outbackprop_user, params.in_dim, - params.out_sizes, params.out_strides), - E_SUCCESS); - } else { - lt_outbackprop_user = (dnnLayout_t)output_backprop_shape.GetCurLayout(); - } - } - - // Create DNN primitives - void MklCreatePrimitives(OpKernelContext* context, bool workspace_enabled) { - dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax; - dnnPrimitiveAttributes_t primAttr = nullptr; - - if (workspace_enabled == false) { - CHECK_EQ(dnnPoolingCreateForward_F32( - &prim_pooling_fwd, primAttr, algorithm, lt_input_user, - params.kernel_size, params.kernel_stride, params.in_offset, - dnnBorderZerosAsymm), - E_SUCCESS); - } - - CHECK_EQ(dnnPoolingCreateBackward_F32( - &prim_pooling_bwd, primAttr, algorithm, lt_input_user, - params.kernel_size, params.kernel_stride, params.in_offset, - dnnBorderZerosAsymm), - E_SUCCESS); - - // Creates conversions - CHECK_EQ(dnnLayoutCreateFromPrimitive_F32( - <_outbackprop_prim, prim_pooling_bwd, dnnResourceDiffDst), - E_SUCCESS); - - if (workspace_enabled == false) { - CHECK_EQ(dnnLayoutCreateFromPrimitive_F32( - <_input_prim, prim_pooling_fwd, dnnResourceSrc), - E_SUCCESS); - if (!dnnLayoutCompare_F32(lt_input_user, lt_input_prim)) { - CHECK_EQ(dnnConversionCreate_F32(&convert_input, lt_input_user, - lt_input_prim), - E_SUCCESS); - AllocTmpBuffer(context, &input_buf_tensor, lt_input_prim, &input_buf); - } - } - - if (!dnnLayoutCompare_F32(lt_outbackprop_user, lt_outbackprop_prim)) { - CHECK_EQ( - dnnConversionCreate_F32(&convert_outbackprop, lt_outbackprop_user, - lt_outbackprop_prim), - E_SUCCESS); - AllocTmpBuffer(context, &outbackprop_buf_tensor, lt_outbackprop_prim, - &outbackprop_buf); - } - } - - // Compare incoming tensor layouts with MKL preferred layouts and convert - // data to the preferred layout if necessary - void MklPrepareInputs(OpKernelContext* context, bool workspace_enabled) { - const Tensor& tensor_in = MklGetInput(context, 0); - const Tensor& out_backprop = MklGetInput(context, 2); - bool input_in_mkl_format = input_shape.IsMklTensor(); - bool outbackprop_in_mkl_format = output_backprop_shape.IsMklTensor(); - - void* tmp_output_buf = nullptr; - void* workspace_buf = nullptr; - - if (workspace_enabled == false) { - if (convert_input != nullptr) { - if (input_in_mkl_format == false) { - CHECK_EQ(dnnConversionExecute_F32( - convert_input, - const_cast<void*>(static_cast<const void*>( - tensor_in.flat<T>().data())), - input_buf), - E_SUCCESS); - CHECK_EQ(dnnDelete_F32(convert_input), E_SUCCESS); - convert_input = nullptr; - } else { - input_shape.GetConvertedFlatData( - lt_input_prim, - const_cast<void*>( - static_cast<const void*>(tensor_in.flat<T>().data())), - input_buf); - } - pooling_resfwd[dnnResourceSrc] = input_buf; - } else { - pooling_resfwd[dnnResourceSrc] = const_cast<void*>( - static_cast<const void*>(tensor_in.flat<T>().data())); - } - - dnnLayout_t lt_workspace; - CHECK_EQ(dnnLayoutCreateFromPrimitive_F32( - <_workspace, prim_pooling_fwd, dnnResourceWorkspace), - E_SUCCESS); - AllocTmpBuffer(context, &workspace_buf_tensor, lt_workspace, - &workspace_buf); - pooling_resfwd[dnnResourceWorkspace] = workspace_buf; - - dnnLayoutDelete_F32(lt_workspace); - - // We create the layout for max pooling fwd prop tmp output here - AllocTmpBuffer(context, &tmp_output_buf_tensor, 
lt_outbackprop_prim, - &tmp_output_buf); - pooling_resfwd[dnnResourceDst] = tmp_output_buf; - - CHECK_EQ(dnnExecute_F32(prim_pooling_fwd, pooling_resfwd), E_SUCCESS); - pooling_res[dnnResourceWorkspace] = - pooling_resfwd[dnnResourceWorkspace]; - } else { - const Tensor& workspace = MklGetInput(context, 3); - pooling_res[dnnResourceWorkspace] = const_cast<void*>( - static_cast<const void*>(workspace.flat<T>().data())); - } - - // Out backprop conversions if needed - if (convert_outbackprop != nullptr) { - if (outbackprop_in_mkl_format == false) { - CHECK_EQ(dnnConversionExecute_F32( - convert_outbackprop, - const_cast<void*>(static_cast<const void*>( - out_backprop.flat<T>().data())), - outbackprop_buf), - E_SUCCESS); - CHECK_EQ(dnnDelete_F32(convert_outbackprop), E_SUCCESS); - } else { - output_backprop_shape.GetConvertedFlatData( - lt_outbackprop_prim, - const_cast<void*>( - static_cast<const void*>(out_backprop.flat<T>().data())), - outbackprop_buf); - } - pooling_res[dnnResourceDiffDst] = outbackprop_buf; - } else { - pooling_res[dnnResourceDiffDst] = const_cast<void*>( - static_cast<const void*>(out_backprop.flat<T>().data())); - } - } - - void MklCleanup(bool workspace_enabled) { - bool input_in_mkl_format = input_shape.IsMklTensor(); - bool outbackprop_in_mkl_format = output_backprop_shape.IsMklTensor(); - if (workspace_enabled == false) { - CHECK_EQ(dnnDelete_F32(prim_pooling_fwd), E_SUCCESS); - } - CHECK_EQ(dnnDelete_F32(prim_pooling_bwd), E_SUCCESS); - if (outbackprop_in_mkl_format == false) { - CHECK_EQ(dnnLayoutDelete_F32(lt_outbackprop_user), E_SUCCESS); - } - CHECK_EQ(dnnLayoutDelete_F32(lt_outbackprop_prim), E_SUCCESS); - if (input_in_mkl_format == false) { - CHECK_EQ(dnnLayoutDelete_F32(lt_input_user), E_SUCCESS); - } - if (workspace_enabled == false) { - CHECK_EQ(dnnLayoutDelete_F32(lt_input_prim), E_SUCCESS); - } - } - } MklMaxPoolingGradOpContext; - - std::vector<int32> ksize_; - std::vector<int32> stride_; - Padding padding_; - TensorFormat data_format_; - - bool workspace_enabled_; -}; // MklMaxPoolingGradOp - -#else - // An implementation of MaxPooling (forward). template <typename Device, typename T> class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> { @@ -879,8 +420,6 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> { TF_CALL_float(REGISTER_MKL_MAXPOOL3D_KERNELS); TF_CALL_bfloat16(REGISTER_MKL_MAXPOOL3D_KERNELS); -#endif // INTEL_MKL_ML_ONLY - #define REGISTER_MKL_MAXPOOL_KERNELS(T) \ REGISTER_KERNEL_BUILDER(Name("_MklMaxPool") \ .Device(DEVICE_CPU) \ diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc index 349ffcb8887..30f7b3f38f7 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc @@ -16,16 +16,15 @@ limitations under the License. 
#ifdef INTEL_MKL #include "tensorflow/core/kernels/mkl_pooling_ops_common.h" + #include <limits> #include <vector> + #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/common_shape_fns.h" namespace tensorflow { - -#ifndef INTEL_MKL_ML_ONLY - using mkldnn::pooling_avg; using mkldnn::pooling_avg_exclude_padding; using mkldnn::pooling_avg_include_padding; @@ -220,8 +219,6 @@ void MklPoolingBwdPrimitive<T>::Execute(const T* diff_dst_data, template class MklPoolingBwdPrimitive<float>; template class MklPoolingBwdPrimitive<bfloat16>; -#endif // !INTEL_MKL_ML_ONLY - // Initialization for TensorFlow format void MklPoolParameters::Init(OpKernelContext* context, const std::vector<int32>& ksize, @@ -249,22 +246,6 @@ void MklPoolParameters::Init(OpKernelContext* context, Init(context, ksize, stride, padding, data_format); } -#ifdef INTEL_MKL_ML_ONLY -// Initialization for MKL format -void MklPoolParameters::Init(OpKernelContext* context, - const std::vector<int32>& ksize, - const std::vector<int32>& stride, Padding padding, - TensorFormat data_format, - const MklShape* mklInputShape) { - // Get the input sizes - depth = mklInputShape->GetSizes()[2]; - tensor_in_cols = mklInputShape->GetSizes()[0]; - tensor_in_rows = mklInputShape->GetSizes()[1]; - tensor_in_batch = mklInputShape->GetSizes()[3]; - - Init(context, ksize, stride, padding, data_format); -} -#else // Initialization for MKL format void MklPoolParameters::Init(OpKernelContext* context, const std::vector<int32>& ksize, @@ -289,7 +270,7 @@ void MklPoolParameters::Init(OpKernelContext* context, Init(context, ksize, stride, padding, data_format); } -#endif // INTEL_MKL_ML_ONLY + // Common Initialization for TensorFlow and MKL formats void MklPoolParameters::Init(OpKernelContext* context, const std::vector<int32>& ksize, @@ -357,7 +338,7 @@ void MklPoolParameters::Init(OpKernelContext* context, OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( tensor_in_cols, window_cols, col_stride, padding, &out_width, &pad_left, &pad_right)); -#ifndef INTEL_MKL_ML_ONLY + // TF can work with int64, but mkldnn only supports int32 // Fail if the depth, height or width are greater than MAX_INT // We check depth only for 3D pooling case @@ -375,7 +356,7 @@ void MklPoolParameters::Init(OpKernelContext* context, OP_REQUIRES(context, FastBoundsCheck(out_width, std::numeric_limits<int>::max()), errors::InvalidArgument("output width is too large")); -#endif + out_depth = depth; // output will have the same depth as the input } else { // we are pooling in the depth dimension // Our current version of depthwise max pooling does not support diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index 6e42b70d149..ec440a0aedf 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -20,21 +20,17 @@ limitations under the License. 
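Note on the pooling hunks above: with the INTEL_MKL_ML_ONLY branches gone, only the MKL-DNN path remains, and the surrounding context lines reference MklPoolingFwdPrimitive/MklPoolingBwdPrimitive and MklPoolingBwdPrimitiveFactory, which presumably build each primitive once per shape and reuse it. Below is a minimal, self-contained sketch of that cache-by-key idea, under the assumption that this is how the factories are used; PoolParams, FakePrimitive, and PrimitiveCache are illustrative names, not TensorFlow symbols.

    // Sketch only: a simplified stand-in for an MklPrimitiveFactory-style cache.
    #include <map>
    #include <memory>
    #include <sstream>
    #include <string>

    struct PoolParams {
      int n, c, h, w;      // input dims
      int kh, kw, sh, sw;  // kernel and stride
    };

    // Stand-in for an expensive-to-build primitive.
    struct FakePrimitive {
      explicit FakePrimitive(const PoolParams& p) : params(p) {}
      PoolParams params;
    };

    class PrimitiveCache {
     public:
      // Builds the primitive on first use for a given shape, then reuses it.
      std::shared_ptr<FakePrimitive> GetOrCreate(const PoolParams& p) {
        const std::string key = MakeKey(p);
        auto it = cache_.find(key);
        if (it != cache_.end()) return it->second;
        auto prim = std::make_shared<FakePrimitive>(p);
        cache_.emplace(key, prim);
        return prim;
      }

     private:
      static std::string MakeKey(const PoolParams& p) {
        std::ostringstream os;
        os << p.n << "x" << p.c << "x" << p.h << "x" << p.w << "_" << p.kh << "x"
           << p.kw << "_" << p.sh << "x" << p.sw;
        return os.str();
      }
      std::map<std::string, std::shared_ptr<FakePrimitive>> cache_;
    };
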
#include <memory> #include <string> #include <vector> + +#include "mkldnn.hpp" #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/padding.h" -#ifndef INTEL_MKL_ML_ONLY -#include "mkldnn.hpp" using mkldnn::memory; using mkldnn::pooling_backward; using mkldnn::pooling_forward; using mkldnn::stream; -#endif namespace tensorflow { - -#ifndef INTEL_MKL_ML_ONLY - using mkldnn::memory; using mkldnn::pooling_avg; using mkldnn::pooling_avg_exclude_padding; @@ -357,7 +353,6 @@ class MklPoolingBwdPrimitiveFactory : public MklPrimitiveFactory<T> { this->SetOp(key, op); } }; -#endif typedef Eigen::ThreadPoolDevice CPUDevice; @@ -424,15 +419,9 @@ struct MklPoolParameters { void Init(OpKernelContext* context, const std::vector<int32>& ksize, const std::vector<int32>& stride, Padding padding, TensorFormat data_format, const TensorShape& tensor_in_shape); -#ifdef INTEL_MKL_ML_ONLY - void Init(OpKernelContext* context, const std::vector<int32>& ksize, - const std::vector<int32>& stride, Padding padding, - TensorFormat data_format, const MklShape* mkl_in_shape); -#else void Init(OpKernelContext* context, const std::vector<int32>& ksize, const std::vector<int32>& stride, Padding padding, TensorFormat data_format, const MklDnnShape* mkl_in_shape); -#endif private: // Common initialization for TensorFlow and MKL formats @@ -441,8 +430,6 @@ struct MklPoolParameters { TensorFormat data_format); }; -#ifndef INTEL_MKL_ML_ONLY - template <class T> class MklPoolingOpBase : public OpKernel { public: @@ -750,7 +737,6 @@ class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> { return grad_reorder_needed ? target_diff_dst_md : original_input_grad_md; } }; -#endif // INTEL_MKL_ML_ONLY //------------------------------------------------------------------- // Utility functions diff --git a/tensorflow/core/kernels/mkl_slice_op.cc b/tensorflow/core/kernels/mkl_slice_op.cc index d0832f2b295..5d238a24bc6 100644 --- a/tensorflow/core/kernels/mkl_slice_op.cc +++ b/tensorflow/core/kernels/mkl_slice_op.cc @@ -16,8 +16,9 @@ limitations under the License. // See docs in ../ops/array_ops.cc. #ifdef INTEL_MKL -#ifndef INTEL_MKL_ML_ONLY +#include "mkldnn.hpp" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -25,9 +26,6 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/prefetch.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" - -#include "mkldnn.hpp" #include "tensorflow/core/util/mkl_util.h" using mkldnn::stream; @@ -490,5 +488,4 @@ TF_CALL_bfloat16(REGISTER_MKL_SLICE); } // namespace tensorflow -#endif // INTEL_MKL_DNN #endif // INTEL_MKL diff --git a/tensorflow/core/kernels/mkl_transpose_op.cc b/tensorflow/core/kernels/mkl_transpose_op.cc index e89aa1eb760..d3025d34d87 100644 --- a/tensorflow/core/kernels/mkl_transpose_op.cc +++ b/tensorflow/core/kernels/mkl_transpose_op.cc @@ -184,9 +184,10 @@ Status MklTransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in, case DT_FLOAT: return MKLTransposeND<float>(ctx, in, out, perm); break; - case DT_BFLOAT16: - return MKLTransposeND<bfloat16>(ctx, in, out, perm); - break; + // TODO(nhasabni): Enable this case when we turn on bfloat16 compilation. 
+ // case DT_BFLOAT16: + // return MKLTransposeND<bfloat16>(ctx, in, out, perm); + // break; // TODO(nhasabni): support other types such as INT8. default: break; @@ -231,9 +232,10 @@ Status MklConjugateTransposeCpuOp::DoTranspose(OpKernelContext* ctx, case DT_FLOAT: return MKLTransposeND<float>(ctx, in, out, perm); break; - case DT_BFLOAT16: - return MKLTransposeND<bfloat16>(ctx, in, out, perm); - break; + // TODO(nhasabni): Enable this case when we turn on bfloat16 compilation. + // case DT_BFLOAT16: + // return MKLTransposeND<bfloat16>(ctx, in, out, perm); + // break; // TODO(nhasabni): support other types such as INT8. default: break; diff --git a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc index 402146b7528..8143a033960 100644 --- a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc +++ b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc @@ -109,7 +109,7 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> { output.device(d) = output.constant(0LL); const int32 work_items = batch_size * num_samples * num_classes; - CudaLaunchConfig config = GetCudaLaunchConfig(work_items, d); + GpuLaunchConfig config = GetCudaLaunchConfig(work_items, d); TF_CHECK_OK(CudaLaunchKernel( MultinomialKernel<OutputType>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, diff --git a/tensorflow/core/kernels/mutex_ops.cc b/tensorflow/core/kernels/mutex_ops.cc index 2f4a5e9aa03..0cc29b42d93 100644 --- a/tensorflow/core/kernels/mutex_ops.cc +++ b/tensorflow/core/kernels/mutex_ops.cc @@ -74,6 +74,8 @@ class Mutex : public ResourceBase { struct SharedLockReleaser { std::shared_ptr<LockReleaser> shared_lock; + SharedLockReleaser() : shared_lock() {} + explicit SharedLockReleaser(std::shared_ptr<LockReleaser>&& lock) : shared_lock(std::forward<decltype(lock)>(lock)) { VLOG(3) << "Creating shared_ptr of " << shared_lock.get() @@ -86,6 +88,16 @@ class Mutex : public ResourceBase { << " count is: " << shared_lock.use_count(); } + SharedLockReleaser& operator=(const SharedLockReleaser& rhs) = delete; + + SharedLockReleaser& operator=(SharedLockReleaser&& rhs) { + if (&rhs == this) return *this; + std::swap(shared_lock, rhs.shared_lock); + VLOG(3) << "Move-assign of SharedLockReleaser of " << shared_lock.get() + << " count is: " << shared_lock.use_count(); + return *this; + } + SharedLockReleaser(const SharedLockReleaser& rhs) : shared_lock(rhs.shared_lock) { VLOG(3) << "Copying SharedLockReleaser of " << shared_lock.get() diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc index ac85b1b55b0..65d653123b1 100644 --- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc +++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc @@ -240,7 +240,7 @@ struct TruncatedNormalFunctor<GPUDevice, T> { typename TTypes<T>::ConstFlat maxvals, const random::PhiloxRandom& gen, typename TTypes<T>::Flat output) { - const auto config = GetCudaLaunchConfig(num_elements, d); + const auto config = GetGpuLaunchConfig(num_elements, d); TF_CHECK_OK(CudaLaunchKernel( TruncatedNormalKernel<T>, config.block_count, config.thread_per_block, diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc index 03f72495450..2c3a8f76f0c 100644 --- a/tensorflow/core/kernels/partitioned_function_ops.cc +++ b/tensorflow/core/kernels/partitioned_function_ops.cc @@ -23,6 +23,7 @@ 
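Note on the mutex_ops.cc hunk above: SharedLockReleaser gains a default constructor and a swap-based move assignment. Swapping (rather than overwriting) hands the previously held lock to the moved-from right-hand side, so it is still released exactly once when that object is destroyed. A small standalone sketch of the same idea, assuming nothing beyond the standard library; LockGuard and Releaser here are illustrative names, not the TensorFlow types.

    // Sketch only: swap-based move assignment releases the old lock exactly once.
    #include <iostream>
    #include <memory>
    #include <utility>

    struct LockGuard {
      explicit LockGuard(int id) : id(id) { std::cout << "acquire " << id << "\n"; }
      ~LockGuard() { std::cout << "release " << id << "\n"; }
      int id;
    };

    struct Releaser {
      std::shared_ptr<LockGuard> lock;

      Releaser() = default;
      explicit Releaser(std::shared_ptr<LockGuard> l) : lock(std::move(l)) {}

      Releaser& operator=(Releaser&& rhs) {
        if (&rhs == this) return *this;
        // Hand our old lock (if any) to rhs; it is released when rhs dies.
        std::swap(lock, rhs.lock);
        return *this;
      }
    };

    int main() {
      Releaser held;                                    // default-constructed, empty
      held = Releaser(std::make_shared<LockGuard>(1));  // move-assign a new lock in
      return 0;  // lock 1 released once, when 'held' goes out of scope
    }
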
limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/protobuf/rewriter_config.pb.h" @@ -218,8 +219,12 @@ void PartitionedCallOp::RunFunction(FunctionLibraryRuntime::Handle handle, FunctionLibraryRuntime* lib, OpKernelContext* ctx, DoneCallback done) { FunctionLibraryRuntime::Options run_opts; - run_opts.step_id = ctx->step_id(); - run_opts.step_container = ctx->step_container(); + ResourceMgr* resource_mgr = lib->device()->resource_manager(); + ScopedStepContainer* step_container = new ScopedStepContainer( + run_opts.step_id, [resource_mgr](const string& name) { + resource_mgr->Cleanup(name).IgnoreError(); + }); + run_opts.step_container = step_container; run_opts.cancellation_manager = ctx->cancellation_manager(); run_opts.stats_collector = ctx->stats_collector(); run_opts.collective_executor = ctx->collective_executor(); @@ -229,15 +234,20 @@ void PartitionedCallOp::RunFunction(FunctionLibraryRuntime::Handle handle, run_opts.source_device = lib->device() == nullptr ? "" : lib->device()->name(); run_opts.allow_dead_tensors = true; - // TODO(akshayka): Accommodate the multiple-worker scenario by adding the - // constructed rendezvous to a rendezvous manager. - Rendezvous* rendez = new IntraProcessRendezvous(lib->device_mgr()); + + Rendezvous* rendez; + OP_REQUIRES_OK_ASYNC( + ctx, + ctx->create_rendezvous(run_opts.step_id, + ctx->function_library()->device_mgr(), &rendez), + done); run_opts.rendezvous = rendez; std::vector<Tensor>* rets = new std::vector<Tensor>; const string& func_name = func_->name(); lib->Run(run_opts, handle, inputs, rets, - [rets, rendez, done, ctx, func_name](const Status& status) { + [rets, rendez, done, ctx, func_name, + step_container](const Status& status) { if (!status.ok()) { const string function_and_msg = strings::StrCat(errors::FormatFunctionForError(func_name), @@ -249,6 +259,7 @@ void PartitionedCallOp::RunFunction(FunctionLibraryRuntime::Handle handle, } } delete rets; + delete step_container; rendez->Unref(); done(); }); diff --git a/tensorflow/core/kernels/pooling_ops_3d_gpu.cu.cc b/tensorflow/core/kernels/pooling_ops_3d_gpu.cu.cc index 05810945442..1b28d8b5923 100644 --- a/tensorflow/core/kernels/pooling_ops_3d_gpu.cu.cc +++ b/tensorflow/core/kernels/pooling_ops_3d_gpu.cu.cc @@ -142,7 +142,7 @@ bool MaxPool3dGradBackward<T>::operator()( const T* top_diff, T* bottom_diff, const Eigen::GpuDevice& d) { int num_kernels = batch * channels * pooled_plane * pooled_height * pooled_width; - CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); + GpuLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); if (data_format == FORMAT_NHWC) { TF_CHECK_OK(CudaLaunchKernel( MaxPoolGradBackwardNoMaskNDHWC<T>, config.block_count, diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc index 903cf9313a2..01a353cb175 100644 --- a/tensorflow/core/kernels/pooling_ops_common.cc +++ b/tensorflow/core/kernels/pooling_ops_common.cc @@ -21,7 +21,7 @@ limitations under the License. 
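Note on the partitioned_function_ops.cc hunk above: the op now owns a per-invocation ScopedStepContainer whose cleanup callback goes through the device ResourceMgr, creates its rendezvous via ctx->create_rendezvous(), and releases both in the lib->Run completion callback. A minimal sketch of that own-then-release-in-the-done-callback pattern, using only the standard library; StepState and RunAsync are hypothetical names, not TensorFlow APIs.

    // Sketch only: per-invocation state freed in the (possibly async) done callback.
    #include <functional>
    #include <iostream>
    #include <string>

    struct StepState {
      explicit StepState(std::function<void(const std::string&)> cleanup_fn)
          : cleanup(std::move(cleanup_fn)) {}
      ~StepState() { cleanup("step_resources"); }  // mirrors ResourceMgr::Cleanup
      std::function<void(const std::string&)> cleanup;
    };

    void RunAsync(std::function<void()> done) { done(); }  // stands in for lib->Run

    int main() {
      auto* state = new StepState(
          [](const std::string& name) { std::cout << "cleanup " << name << "\n"; });
      // The callback captures the raw pointer and is the single place it is freed,
      // so the state outlives the asynchronous function execution.
      RunAsync([state]() { delete state; });
      return 0;
    }
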
#include "tensorflow/core/framework/tensor.h" #if GOOGLE_CUDA -#include "cuda/include/cudnn.h" +#include "third_party/gpus/cudnn/cudnn.h" #include "tensorflow/core/kernels/conv_2d.h" #include "tensorflow/core/kernels/pooling_ops_common_gpu.h" #include "tensorflow/core/platform/stream_executor.h" diff --git a/tensorflow/core/kernels/pooling_ops_common_gpu.h b/tensorflow/core/kernels/pooling_ops_common_gpu.h index 7362c5275f7..9685bd9fdd0 100644 --- a/tensorflow/core/kernels/pooling_ops_common_gpu.h +++ b/tensorflow/core/kernels/pooling_ops_common_gpu.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if !GOOGLE_CUDA -#error This file must only be included when building with Cuda support +#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM +#error This file must only be included when building with Cuda or ROCm support #endif #ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_GPU_H_ diff --git a/tensorflow/core/kernels/population_count_op_gpu.cu.cc b/tensorflow/core/kernels/population_count_op_gpu.cu.cc index dc4f3564f0d..22beadfe61a 100644 --- a/tensorflow/core/kernels/population_count_op_gpu.cu.cc +++ b/tensorflow/core/kernels/population_count_op_gpu.cu.cc @@ -69,7 +69,7 @@ __global__ void PopulationCountKernel<int64>(const int size, const int64* input, TTypes<uint8>::Flat output) { \ const GPUDevice& d = c->eigen_device<GPUDevice>(); \ int64 total_count = input.size(); \ - CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); \ + GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); \ TF_CHECK_OK(CudaLaunchKernel(PopulationCountKernel<T>, config.block_count, \ config.thread_per_block, 0, d.stream(), \ total_count, input.data(), output.data())); \ diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h index b7d87b2445d..a2614bfc63c 100644 --- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h +++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h @@ -28,7 +28,7 @@ limitations under the License. #include "third_party/cub/iterator/counting_input_iterator.cuh" #include "third_party/cub/iterator/transform_input_iterator.cuh" #include "third_party/cub/warp/warp_reduce.cuh" -#include "cuda/include/cuComplex.h" +#include "third_party/gpus/cuda/include/cuComplex.h" #include "tensorflow/core/kernels/reduction_ops.h" #include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/util/gpu_device_functions.h" diff --git a/tensorflow/core/kernels/redux_functor.h b/tensorflow/core/kernels/redux_functor.h index ee60008f11d..05a867ab007 100644 --- a/tensorflow/core/kernels/redux_functor.h +++ b/tensorflow/core/kernels/redux_functor.h @@ -16,7 +16,11 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_KERNELS_REDUX_FUNCTOR_H_ #define TENSORFLOW_CORE_KERNELS_REDUX_FUNCTOR_H_ +#define EIGEN_USE_THREADS + #include "third_party/eigen3/Eigen/Core" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc index 2fc56d077b0..2ade89b7ff5 100644 --- a/tensorflow/core/kernels/relu_op_gpu.cu.cc +++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc @@ -104,7 +104,7 @@ struct ReluGrad<Device, Eigen::half> { if (count == 0) return; int32 half2_count = Eigen::divup(count, 2); constexpr int32 kThreadInBlock = 512; - CudaLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( + GpuLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock); TF_CHECK_OK(CudaLaunchKernel( ReluGradHalfKernel, config.block_count, config.thread_per_block, 0, @@ -133,7 +133,7 @@ struct Relu<Device, qint8> { int32 vect_count = Eigen::divup(count, 4); constexpr int32 kThreadInBlock = 512; - CudaLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( + GpuLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( vect_count, d, Relu_int8x4_kernel, 0, kThreadInBlock); TF_CHECK_OK(CudaLaunchKernel( Relu_int8x4_kernel, config.block_count, config.thread_per_block, 0, diff --git a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc index 06c7526e597..7c8ac7db359 100644 --- a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc +++ b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc @@ -278,7 +278,7 @@ struct ResizeBilinear<GPUDevice, T> { const int total_count = batch * out_height * out_width * channels; if (total_count == 0) return; - CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); + GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); if (half_pixel_centers) { TF_CHECK_OK(CudaLaunchKernel( ResizeBilinearKernel<T>, config.block_count, config.thread_per_block, @@ -312,19 +312,19 @@ struct ResizeBilinearGrad<GPUDevice, T> { const int resized_width = input_grad.dimension(2); int total_count; - CudaLaunchConfig config; + GpuLaunchConfig config; // Initialize output_grad with all zeros. total_count = batch * original_height * original_width * channels; if (total_count == 0) return; - config = GetCudaLaunchConfig(total_count, d); + config = GetGpuLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel( SetZero<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, output_grad.data())); // Accumulate. 
total_count = batch * resized_height * resized_width * channels; - config = GetCudaLaunchConfig(total_count, d); + config = GetGpuLaunchConfig(total_count, d); if (half_pixel_centers) { TF_CHECK_OK(CudaLaunchKernel( ResizeBilinearGradKernel<T>, config.block_count, diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc index 8bcec43a594..5ae1bfc92e1 100644 --- a/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc +++ b/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc @@ -172,7 +172,7 @@ struct ResizeNearestNeighbor<GPUDevice, T, half_pixel_centers, align_corners> { const int output_size = batch_size * out_height * out_width * channels; if (output_size == 0) return true; - CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d); + GpuLaunchConfig config = GetCudaLaunchConfig(output_size, d); if (half_pixel_centers) { TF_CHECK_OK(CudaLaunchKernel( ResizeNearestNeighborNHWC<T>, config.block_count, @@ -218,7 +218,7 @@ struct ResizeNearestNeighborGrad<GPUDevice, T, half_pixel_centers, const int output_size = batch_size * channels * out_height * out_width; - CudaLaunchConfig output_config = GetCudaLaunchConfig(output_size, d); + GpuLaunchConfig output_config = GetCudaLaunchConfig(output_size, d); TF_CHECK_OK(CudaLaunchKernel(SetZero<T>, output_config.block_count, output_config.thread_per_block, 0, d.stream(), output_size, output.data())); @@ -227,7 +227,7 @@ struct ResizeNearestNeighborGrad<GPUDevice, T, half_pixel_centers, const int input_size = batch_size * channels * in_height * in_width; if (input_size == 0) return true; - CudaLaunchConfig input_config = GetCudaLaunchConfig(input_size, d); + GpuLaunchConfig input_config = GetCudaLaunchConfig(input_size, d); if (half_pixel_centers) { TF_CHECK_OK(CudaLaunchKernel( ResizeNearestNeighborBackwardNHWC<T>, input_config.block_count, diff --git a/tensorflow/core/kernels/scan_ops_gpu.h b/tensorflow/core/kernels/scan_ops_gpu.h index c23d31e2b30..331aeca6a77 100644 --- a/tensorflow/core/kernels/scan_ops_gpu.h +++ b/tensorflow/core/kernels/scan_ops_gpu.h @@ -29,7 +29,7 @@ limitations under the License. 
#include "third_party/cub/block/block_store.cuh" #include "third_party/cub/iterator/counting_input_iterator.cuh" #include "third_party/cub/iterator/transform_input_iterator.cuh" -#include "cuda/include/cuComplex.h" +#include "third_party/gpus/cuda/include/cuComplex.h" #include "tensorflow/core/framework/numeric_types.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/scan_ops.h" diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h index c3a3d0a6f7f..6c195e59e20 100644 --- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h +++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h @@ -126,7 +126,7 @@ struct ScatterFunctor<GPUDevice, T, Index, op> { const Index first_dim_size = params.dimension(0); const Index indices_size = indices.size(); const Index updates_size = updates.size(); - CudaLaunchConfig config = GetCudaLaunchConfig(updates_size, d); + GpuLaunchConfig config = GetCudaLaunchConfig(updates_size, d); TF_CHECK_OK(CudaLaunchKernel( scatter_op_gpu::ScatterOpCustomKernel<T, Index, op>, config.block_count, config.thread_per_block, 0, d.stream(), params.data(), updates.data(), @@ -147,7 +147,7 @@ struct ScatterScalarFunctor<GPUDevice, T, Index, op> { const Index first_dim_size = params.dimension(0); const Index indices_size = indices.size(); const Index synthesized_updates_size = indices_size * params.dimension(1); - CudaLaunchConfig config = GetCudaLaunchConfig(synthesized_updates_size, d); + GpuLaunchConfig config = GetCudaLaunchConfig(synthesized_updates_size, d); TF_CHECK_OK(CudaLaunchKernel( scatter_op_gpu::ScatterScalarOpCustomKernel<T, Index, op>, config.block_count, config.thread_per_block, 0, d.stream(), diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc index 358302ea7d2..9152e71acb2 100644 --- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc @@ -135,7 +135,7 @@ struct ScatterNdFunctor<GPUDevice, T, Index, op, IXDIM> { } } - CudaLaunchConfig config = GetCudaLaunchConfig(Toutput.size(), d); + GpuLaunchConfig config = GetCudaLaunchConfig(Toutput.size(), d); TF_CHECK_OK(CudaLaunchKernel(ScatterNdOpKernel<T, Index, op, IXDIM>, config.block_count, config.thread_per_block, 0, diff --git a/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc b/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc index def7f3a77da..bd20793b078 100644 --- a/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc +++ b/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc @@ -64,8 +64,8 @@ struct UpperBoundFunctor<GPUDevice, T, OutType> { int batch_size, int num_inputs, int num_values, typename TTypes<OutType, 1>::Tensor* output) { const cudaStream_t& stream = GetCudaStream(context); - CudaLaunchConfig config = - GetCudaLaunchConfig(values.size(), context->eigen_gpu_device()); + GpuLaunchConfig config = + GetGpuLaunchConfig(values.size(), context->eigen_gpu_device()); TF_CHECK_OK(CudaLaunchKernel( UpperBoundKernel<T, OutType>, config.block_count, @@ -84,8 +84,8 @@ struct LowerBoundFunctor<GPUDevice, T, OutType> { int batch_size, int num_inputs, int num_values, typename TTypes<OutType, 1>::Tensor* output) { const cudaStream_t& stream = GetCudaStream(context); - CudaLaunchConfig config = - GetCudaLaunchConfig(values.size(), context->eigen_gpu_device()); + GpuLaunchConfig config = + GetGpuLaunchConfig(values.size(), context->eigen_gpu_device()); TF_CHECK_OK(CudaLaunchKernel( LowerBoundKernel<T, OutType>, 
config.block_count, diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc index dd55b5cf92a..305673b56fc 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc @@ -138,7 +138,7 @@ void SegmentSumFunctor<T, Index>::operator()( return; } // Set 'output' to zeros. - CudaLaunchConfig config = GetCudaLaunchConfig(output.size(), d); + GpuLaunchConfig config = GetCudaLaunchConfig(output.size(), d); TF_CHECK_OK(CudaLaunchKernel(SetZero<T>, config.block_count, config.thread_per_block, 0, d.stream(), output.size(), output.data())); @@ -163,7 +163,7 @@ void SegmentSumFunctor<T, Index>::operator()( const Index total_stripe_count = input_inner_dim_size * input_outer_dim_num_stripe; - config = GetCudaLaunchConfig(total_stripe_count, d); + config = GetGpuLaunchConfig(total_stripe_count, d); TF_CHECK_OK(CudaLaunchKernel( SortedSegmentSumCustomKernel<T, Index, OuterDimTileSize>, config.block_count, config.thread_per_block, 0, d.stream(), @@ -184,7 +184,7 @@ struct UnsortedSegmentFunctor<GPUDevice, T, Index, InitialValueF, ReductionF> { } // Set 'output' to initial value. GPUDevice d = ctx->template eigen_device<GPUDevice>(); - CudaLaunchConfig config = GetCudaLaunchConfig(output.size(), d); + GpuLaunchConfig config = GetCudaLaunchConfig(output.size(), d); TF_CHECK_OK(CudaLaunchKernel( SetToValue<T>, config.block_count, config.thread_per_block, 0, d.stream(), output.size(), output.data(), InitialValueF()())); @@ -198,7 +198,7 @@ struct UnsortedSegmentFunctor<GPUDevice, T, Index, InitialValueF, ReductionF> { // *) 'input_outer_dim_size' is the total number of segments to process. const Index input_outer_dim_size = segment_ids.dimension(0); const Index input_inner_dim_size = data_size / input_outer_dim_size; - config = GetCudaLaunchConfig(data_size, d); + config = GetGpuLaunchConfig(data_size, d); TF_CHECK_OK(CudaLaunchKernel( UnsortedSegmentCustomKernel<T, Index, ReductionF>, config.block_count, diff --git a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc index 7ff9fa711e7..4db5c6f30ad 100644 --- a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc @@ -138,8 +138,8 @@ struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, B2S> { return errors::InvalidArgument( "number of batch_tensor elements exceeds 2^32-1"); } - CudaLaunchConfig config = - GetCudaLaunchConfig(static_cast<int32>(total_count), d); + GpuLaunchConfig config = + GetGpuLaunchConfig(static_cast<int32>(total_count), d); return CudaLaunchKernel(S2B<T, NUM_BLOCK_DIMS, B2S>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, diff --git a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc index e01f05dff0a..55573208540 100644 --- a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc +++ b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc @@ -156,7 +156,7 @@ struct SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NHWC> { if (total_count == 0) { return; } - CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); + GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel( S2D_NHWC<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, input.data(), block_size, batch_size, @@ -190,7 +190,7 @@ struct 
SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NCHW> { if (total_count == 0) { return; } - CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); + GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); switch (block_size) { case 2: TF_CHECK_OK(CudaLaunchKernel( @@ -221,7 +221,7 @@ struct SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NCHW> { if (total_count == 0) { return; } - CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); + GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); TF_CHECK_OK(CudaLaunchKernel( S2D_NCHW<T>, config.block_count, config.thread_per_block, 0, d.stream(), config.virtual_thread_count, input.data(), block_size, output_width, diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc index 9e31f12350e..2b00549a9ea 100644 --- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc +++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc @@ -78,7 +78,7 @@ struct SparseTensorDenseMatMulFunctor<GPUDevice, T, Tindices, ADJ_A, ADJ_B> { // TODO(ebrevdo): Should this be alpha * nnz instead of // out.size()? Perhaps p * nnz ? - CudaLaunchConfig config = GetCudaLaunchConfig(p * nnz, d); + GpuLaunchConfig config = GetCudaLaunchConfig(p * nnz, d); TF_CHECK_OK(CudaLaunchKernel( SparseTensorDenseMatMulKernel<T, Tindices, ADJ_A, ADJ_B>, diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc index 649bb29bab4..368239477b1 100644 --- a/tensorflow/core/kernels/split_lib_gpu.cu.cc +++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc @@ -198,7 +198,7 @@ void SplitOpGPULaunch<T>::Run(const Eigen::GpuDevice& d, const T* input, int32 prefix_dim_size, int32 split_dim_size, int32 suffix_dim_size, const GpuDeviceArrayStruct<T*>& output_ptr_data) { - CudaLaunchConfig config = GetCudaLaunchConfig( + GpuLaunchConfig config = GetCudaLaunchConfig( prefix_dim_size * split_dim_size * suffix_dim_size, d); TF_CHECK_OK(CudaLaunchKernel(SplitOpKernel<T>, config.block_count, @@ -214,8 +214,8 @@ void SplitVOpGPULaunch<T, IntType>::Run( const GpuDeviceArrayStruct<IntType>& output_scan, const GpuDeviceArrayStruct<T*>& output_ptr_data) { if (fixed_size) { - CudaLaunchConfig config = - GetCudaLaunchConfig(total_rows * total_cols, gpu_device); + GpuLaunchConfig config = + GetGpuLaunchConfig(total_rows * total_cols, gpu_device); TF_CHECK_OK(CudaLaunchKernel(SplitVOpKernel_fixed<T>, config.block_count, config.thread_per_block, 0, diff --git a/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc b/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc index 9aa73189f0d..8d6e826d625 100644 --- a/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc @@ -70,8 +70,8 @@ void UpdateVariableAndFill_Philox<GPUDevice, Distribution>::operator()( // maximize occupancy const int kGroupSize = Distribution::kResultElementCount; int work_element_count = (output_size + kGroupSize - 1) / kGroupSize; - CudaLaunchConfig cfg = GetCudaLaunchConfig(work_element_count, d, - FillKernel<Distribution>, 0, 0); + GpuLaunchConfig cfg = GetCudaLaunchConfig(work_element_count, d, + FillKernel<Distribution>, 0, 0); int zero = 0; cudaMemcpyToSymbol(thread_counter, &zero, sizeof(int)); diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc index 418d80fc071..3f51820cd55 100644 --- a/tensorflow/core/kernels/svd_op_gpu.cu.cc +++ 
b/tensorflow/core/kernels/svd_op_gpu.cu.cc @@ -74,7 +74,7 @@ __global__ void ComputeValueOfVKernel(Cuda2DLaunchConfig config, int64 m, // Extracts the sign of V // V[i] = V[i]>=0 ? 1 : 0 template <class Scalar> -__global__ void ExtractSignOfVKernel(CudaLaunchConfig config, Scalar* V) { +__global__ void ExtractSignOfVKernel(GpuLaunchConfig config, Scalar* V) { CUDA_1D_KERNEL_LOOP(i, config.virtual_thread_count) { V[i] = V[i] >= 0 ? Scalar(1) : Scalar(-1); } @@ -202,7 +202,7 @@ class SvdOpGpu : public AsyncOpKernel { input_copy.flat<Scalar>().data(), outputU_ptr, outputS_ptr, outputV_ptr)); // 2. clamp V to -1 or +1 - CudaLaunchConfig cfg1D = GetCudaLaunchConfig(batch_size, d); + GpuLaunchConfig cfg1D = GetCudaLaunchConfig(batch_size, d); TF_CHECK_OK(CudaLaunchKernel(ExtractSignOfVKernel<Scalar>, cfg1D.block_count, cfg1D.thread_per_block, 0, d.stream(), cfg1D, outputV_ptr)); diff --git a/tensorflow/core/kernels/tile_functor_gpu.h b/tensorflow/core/kernels/tile_functor_gpu.h index 287b0063922..7d45a9843fd 100644 --- a/tensorflow/core/kernels/tile_functor_gpu.h +++ b/tensorflow/core/kernels/tile_functor_gpu.h @@ -73,7 +73,7 @@ void TileSimple(const Eigen::GpuDevice& d, Tensor* out, const Tensor& in) { // Launch kernel to q[...] = p[...]. const T* p = in.flat<T>().data(); T* q = out->flat<T>().data(); - CudaLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d); + GpuLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d); TF_CHECK_OK( CudaLaunchKernel(TileKernel<T>, cfg.block_count, cfg.thread_per_block, 0, d.stream(), cfg.virtual_thread_count, p, diff --git a/tensorflow/core/kernels/transpose_functor_gpu.cu.cc b/tensorflow/core/kernels/transpose_functor_gpu.cu.cc index fe2be013356..aa9e7196223 100644 --- a/tensorflow/core/kernels/transpose_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/transpose_functor_gpu.cu.cc @@ -79,7 +79,7 @@ void TransposeSimple(const GPUDevice& d, const Tensor& in, // Launch kernel to q[...] = p[...]. 
const T* p = reinterpret_cast<const T*>(in.tensor_data().data()); T* q = reinterpret_cast<T*>(const_cast<char*>((out->tensor_data().data()))); - CudaLaunchConfig cfg = GetCudaLaunchConfig(nelem, d); + GpuLaunchConfig cfg = GetCudaLaunchConfig(nelem, d); TF_CHECK_OK(CudaLaunchKernel( TransposeKernel<T, conjugate>, cfg.block_count, cfg.thread_per_block, 0, d.stream(), cfg.virtual_thread_count, p, diff --git a/tensorflow/core/kernels/tridiagonal_solve_op_gpu.cu.cc b/tensorflow/core/kernels/tridiagonal_solve_op_gpu.cu.cc index 6c6b9ab265e..9a48e9042bc 100644 --- a/tensorflow/core/kernels/tridiagonal_solve_op_gpu.cu.cc +++ b/tensorflow/core/kernels/tridiagonal_solve_op_gpu.cu.cc @@ -205,7 +205,7 @@ class TridiagonalSolveOpGpuLinalg : public LinearAlgebraOp<Scalar> { void SolveForSizeOneOrTwo(OpKernelContext* context, const Scalar* diagonals, const Scalar* rhs, Scalar* output, int m, int k) { const Eigen::GpuDevice& device = context->eigen_device<Eigen::GpuDevice>(); - CudaLaunchConfig cfg = GetCudaLaunchConfig(1, device); + GpuLaunchConfig cfg = GetCudaLaunchConfig(1, device); bool* not_invertible_dev; cudaMalloc(¬_invertible_dev, sizeof(bool)); TF_CHECK_OK(CudaLaunchKernel(SolveForSizeOneOrTwoKernel<Scalar>, diff --git a/tensorflow/core/kernels/where_op_gpu.cu.h b/tensorflow/core/kernels/where_op_gpu.cu.h index b87926cebca..c4895cb95b5 100644 --- a/tensorflow/core/kernels/where_op_gpu.cu.h +++ b/tensorflow/core/kernels/where_op_gpu.cu.h @@ -323,7 +323,7 @@ struct Where<GPUDevice, NDIM, T, TIndex> { const Eigen::array<TIndex, NDIM> strides = CalculateStrides<TIndex, T, NDIM>(input); const TIndex output_rows = output.dimension(0); - CudaLaunchConfig config = GetCudaLaunchConfig(output_rows, d); + GpuLaunchConfig config = GetCudaLaunchConfig(output_rows, d); TF_CHECK_OK(CudaLaunchKernel(PropagateWhereIndicesKernel<NDIM, TIndex>, config.block_count, config.thread_per_block, 0, d.stream(), output_rows, strides, diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc index d069db6d20b..a489d2e9d50 100644 --- a/tensorflow/core/lib/io/zlib_inputstream.cc +++ b/tensorflow/core/lib/io/zlib_inputstream.cc @@ -197,24 +197,21 @@ Status ZlibInputStream::ReadNBytes(int64 bytes_to_read, string* result) { // Now that the cache is empty we need to inflate more data. - // Step 1. Fill up input buffer. - // We read from stream only after the previously read contents have been - // completely consumed. This is an optimization and can be removed if - // it causes problems. `ReadFromStream` is capable of handling partially - // filled up buffers. - if (z_stream_def_->stream->avail_in == 0) { - TF_RETURN_IF_ERROR(ReadFromStream()); - } - - // Step 2. Setup output stream. + // Step 1. Setup output stream. z_stream_def_->stream->next_out = z_stream_def_->output.get(); next_unread_byte_ = reinterpret_cast<char*>(z_stream_def_->output.get()); z_stream_def_->stream->avail_out = output_buffer_capacity_; - // Step 3. Inflate Inflate Inflate! + // Step 2. Try to inflate some input data. TF_RETURN_IF_ERROR(Inflate()); - bytes_to_read -= ReadBytesFromCache(bytes_to_read, result); + // Step 3. Read any data produced by inflate. If no progress was made by + // inflate, read more compressed data from the input stream. 
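The reworked ZlibInputStream::ReadNBytes loop above inverts the old order of operations: it now inflates whatever compressed input is already buffered first, and only refills the input buffer when inflate makes no progress, which is also why the Inflate() change further below has to treat Z_BUF_ERROR as non-fatal. A minimal standalone sketch of that control flow against raw zlib (simplified; this is not the ZlibInputStream class itself, and ReadMoreInput stands in for ReadFromStream):

#include <zlib.h>
#include <string>
#include <vector>

// Sketch only: inflate-first loop that refills the input when no progress
// was made, tolerating Z_BUF_ERROR in exactly the way described above.
bool InflateStep(z_stream* strm, std::vector<char>* scratch,
                 std::string* result, bool (*ReadMoreInput)(z_stream*)) {
  // Step 1. Point zlib at a fresh output buffer.
  strm->next_out = reinterpret_cast<Bytef*>(scratch->data());
  strm->avail_out = static_cast<uInt>(scratch->size());

  // Step 2. Try to inflate the input that is already available.
  int err = inflate(strm, Z_NO_FLUSH);
  if (err != Z_OK && err != Z_STREAM_END && err != Z_BUF_ERROR) {
    return false;  // Real error; Z_BUF_ERROR only means "no progress yet".
  }

  // Step 3. If nothing was produced, fetch more compressed bytes; otherwise
  // hand the freshly inflated bytes to the caller.
  const size_t produced = scratch->size() - strm->avail_out;
  if (produced == 0) return ReadMoreInput(strm);
  result->append(scratch->data(), produced);
  return true;
}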
+ if (NumUnreadBytes() == 0) { + TF_RETURN_IF_ERROR(ReadFromStream()); + } else { + bytes_to_read -= ReadBytesFromCache(bytes_to_read, result); + } } return Status::OK(); @@ -224,7 +221,11 @@ int64 ZlibInputStream::Tell() const { return bytes_read_; } Status ZlibInputStream::Inflate() { int error = inflate(z_stream_def_->stream.get(), zlib_options_.flush_mode); - if (error != Z_OK && error != Z_STREAM_END) { + // Source: http://zlib.net/manual.html + // Z_BUF_ERROR: `inflate` returns Z_BUF_ERROR if no progress was made. This is + // not fatal and `inflate` can be called again with more input and output + // space to continue inflating. + if (error != Z_OK && error != Z_STREAM_END && error != Z_BUF_ERROR) { string error_string = strings::StrCat("inflate() failed with error ", error); if (z_stream_def_->stream->msg != nullptr) { diff --git a/tensorflow/core/lib/monitoring/sampler.cc b/tensorflow/core/lib/monitoring/sampler.cc index b4f577544ad..20c5f1a73fe 100644 --- a/tensorflow/core/lib/monitoring/sampler.cc +++ b/tensorflow/core/lib/monitoring/sampler.cc @@ -96,6 +96,12 @@ class ExponentialBuckets : public Buckets { } // namespace +// static +std::unique_ptr<Buckets> Buckets::Explicit(std::vector<double> bucket_limits) { + return std::unique_ptr<Buckets>( + new ExplicitBuckets(std::move(bucket_limits))); +} + // static std::unique_ptr<Buckets> Buckets::Explicit( std::initializer_list<double> bucket_limits) { diff --git a/tensorflow/core/lib/monitoring/sampler.h b/tensorflow/core/lib/monitoring/sampler.h index d37da62b582..c6f32d46fa2 100644 --- a/tensorflow/core/lib/monitoring/sampler.h +++ b/tensorflow/core/lib/monitoring/sampler.h @@ -96,6 +96,11 @@ class Buckets { static std::unique_ptr<Buckets> Explicit( std::initializer_list<double> bucket_limits); + // This alternative Explicit Buckets factory method is primarily meant to be + // used by the CLIF layer code paths that are incompatible with + // initialize_lists. + static std::unique_ptr<Buckets> Explicit(std::vector<double> bucket_limits); + virtual const std::vector<double>& explicit_bounds() const = 0; }; diff --git a/tensorflow/core/lib/monitoring/sampler_test.cc b/tensorflow/core/lib/monitoring/sampler_test.cc index 05578357fe0..8be15f92185 100644 --- a/tensorflow/core/lib/monitoring/sampler_test.cc +++ b/tensorflow/core/lib/monitoring/sampler_test.cc @@ -61,7 +61,7 @@ TEST(LabeledSamplerTest, ExplicitBucketBoundaries) { auto* init_sampler_without_labels = Sampler<0>::New({"/tensorflow/test/init_sampler_without_labels", "Sampler without labels initialized as empty."}, - Buckets::Explicit({1.5, 2.8})); + Buckets::Explicit(std::vector<double>{1.5, 2.8})); TEST(UnlabeledSamplerTest, InitializedEmpty) { Histogram empty; @@ -112,7 +112,7 @@ TEST(ExponentialSamplerTest, ExponentialBucketBoundaries) { EqHistograms(expected, cell->value()); } -TEST(ExponentialSamplerTest, SameName) { +TEST(ExplicitSamplerTest, SameName) { auto* same_sampler = Sampler<1>::New({"/tensorflow/test/sampler_with_labels", "Sampler with one label.", "MyLabel"}, Buckets::Explicit({10.0, 20.0})); diff --git a/tensorflow/core/nccl/nccl_manager.cc b/tensorflow/core/nccl/nccl_manager.cc index e76ce97e631..ca6e64c34d1 100644 --- a/tensorflow/core/nccl/nccl_manager.cc +++ b/tensorflow/core/nccl/nccl_manager.cc @@ -18,6 +18,7 @@ limitations under the License. 
#ifdef GOOGLE_CUDA +#include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/platform/cuda.h" #include "tensorflow/core/platform/env.h" @@ -44,16 +45,10 @@ using se::cuda::ScopedActivateExecutorContext; // Contains data for a single stream used for nccl communication; this includes // a background thread that calls NcclManager::LoopKernelLaunches. -struct NcclManager::NcclStream { +struct NcclManager::NcclStream : public core::RefCounted { public: - NcclStream() {} - ~NcclStream() { - VLOG(2) << "Entered ~NcclStream " << this; - mutex_lock l(mu); - shutdown_requested = true; - cv.notify_all(); - VLOG(2) << "Done ~NcclStream " << this; - } + NcclStream() = default; + ~NcclStream() = default; se::StreamExecutor* executor = nullptr; @@ -61,11 +56,13 @@ struct NcclManager::NcclStream { // This is a different stream than the tensorflow compute stream. std::unique_ptr<se::Stream> stream; - // See NcclManager::LoopKernelLaunches for information on these. - std::unique_ptr<Thread> thread; + // `mu` protects access to `pending_launches_`, which is the list of + // collectives ready but whose kernels are yet to be launched. When the + // NcclManager object that owns this NcclStream object is destroyed, it + // signals `cv` to unblock the thread waiting on more collectives. mutex mu; condition_variable cv; - // Has collective,participant_idx pairs. + // Has (collective, participant_idx) pairs. std::deque<std::pair<Collective*, int>> pending_launches_ GUARDED_BY(mu); bool shutdown_requested GUARDED_BY(mu) = false; }; @@ -78,7 +75,7 @@ struct NcclManager::CommunicatorMember { } ncclComm_t nccl_comm = nullptr; - // Owned by NcclManager::device_to_comm_streams_. + // Owned by NcclManager::device_to_comm_streams_ and LoopKernelLaunches. NcclStream* nccl_stream = nullptr; }; @@ -129,7 +126,7 @@ void StringToNcclUniqueId(const string& str_id, ncclUniqueId* nccl_id) { // have a single `Collective` per step. However, a collective that executes on // 3 nodes with 4 GPUs each would have a `Collective` per node, each of which is // tracking the 4 GPUs local to that node. 
-struct NcclManager::Collective { +struct NcclManager::Collective : public core::RefCounted { Collective(DataType data_type_in, CollectiveType type_in, ncclRedOp_t reduction_op_in, int num_local_devices_in, int num_global_devices_in, const string& communicator_key_in) @@ -139,8 +136,7 @@ struct NcclManager::Collective { num_local_devices(num_local_devices_in), num_global_devices(num_global_devices_in), single_node(num_local_devices_in == num_global_devices_in), - communicator_key(communicator_key_in), - remaining_participants(num_local_devices_in) { + communicator_key(communicator_key_in) { participants.reserve(num_local_devices_in); } @@ -176,13 +172,23 @@ struct NcclManager::Collective { int available_participants = 0; bool multi_node_ready = false; - mutable std::atomic_int_fast32_t remaining_participants; - Status status; }; NcclManager::NcclManager() { VLOG(2) << "New NcclManager " << this; } -NcclManager::~NcclManager() { VLOG(2) << "~NcclManager " << this; } +NcclManager::~NcclManager() { + VLOG(2) << "~NcclManager " << this; + for (auto& it : device_to_comm_streams_) { + for (NcclStream* nccl_stream : it.second) { + { + mutex_lock l(nccl_stream->mu); + nccl_stream->shutdown_requested = true; + nccl_stream->cv.notify_all(); + } + nccl_stream->Unref(); + } + } +} NcclManager* NcclManager::instance() { static NcclManager* instance = new NcclManager(); return instance; @@ -278,8 +284,8 @@ Status NcclManager::GetCommunicator(NcclManager::Collective* collective, auto& streams = device_to_comm_streams_[executor]; NcclStream* nccl_stream = nullptr; for (const auto& s : streams) { - if (used_streams.insert(s.get()).second) { - nccl_stream = s.get(); + if (used_streams.insert(s).second) { + nccl_stream = s; break; } } @@ -292,9 +298,11 @@ Status NcclManager::GetCommunicator(NcclManager::Collective* collective, streams.emplace_back(nccl_stream); used_streams.insert(nccl_stream); - nccl_stream->thread.reset(env->StartThread( - ThreadOptions(), "nccl_kernel_launch", - [this, nccl_stream] { LoopKernelLaunches(nccl_stream); })); + nccl_stream->Ref(); + env->SchedClosure([this, nccl_stream]() { + LoopKernelLaunches(nccl_stream); + nccl_stream->Unref(); + }); } members[i].nccl_stream = nccl_stream; @@ -386,9 +394,11 @@ void NcclManager::SignalMultiNodeReady(const string& collective_key) { mutex_lock l(mu_); auto collective_it = collectives_.find(collective_key); if (collective_it != collectives_.end()) { - Collective* collective = collective_it->second.get(); + Collective* collective = collective_it->second; collective->multi_node_ready = true; - to_run = CheckReady(collective_key, collective); + if (CheckReady(collective_key, collective)) { + to_run = collective; + } } } @@ -406,14 +416,12 @@ void NcclManager::AddParticipant(std::unique_ptr<Participant> participant, auto collective_it = collectives_.find(context.collective_key); Collective* collective = nullptr; if (collective_it == collectives_.end()) { - auto collective_unique_ptr = absl::make_unique<Collective>( + collective = new Collective( data_type, collective_type, reduction_op, context.num_local_devices, context.num_global_devices, context.communicator_key); - collective = collective_unique_ptr.get(); - collectives_.emplace(context.collective_key, - std::move(collective_unique_ptr)); + collectives_.emplace(context.collective_key, collective); } else { - collective = collective_it->second.get(); + collective = collective_it->second; } // Check `collective` is correct and consistent. 
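In these NcclManager hunks the manual lifetime management (unique_ptr maps, the remaining_participants countdown, and explicit delete) is replaced by core::RefCounted: every container or thread that holds on to an NcclStream or Collective takes a Ref() and drops it with Unref() when finished, and CheckReady now just reports readiness instead of releasing ownership. A rough sketch of the hand-off pattern, not the NcclManager code itself:

#include "tensorflow/core/lib/core/refcount.h"

// Sketch of the Ref()/Unref() hand-offs used above (names simplified).
struct Work : public tensorflow::core::RefCounted {};

void Dispatch(Work* work, int num_consumers) {
  for (int i = 0; i < num_consumers; ++i) {
    work->Ref();  // One reference per consumer thread/queue.
    // ... enqueue `work` for consumer i; that consumer calls work->Unref()
    // when its portion is done (previously: an atomic countdown plus delete).
  }
  work->Unref();  // Drop the dispatching thread's own reference.
}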
@@ -467,26 +475,25 @@ void NcclManager::AddParticipant(std::unique_ptr<Participant> participant, collective->participants.emplace_back(std::move(participant)); ++collective->available_participants; - to_run = CheckReady(context.collective_key, collective); + if (CheckReady(context.collective_key, collective)) { + to_run = collective; + } } if (to_run != nullptr) RunCollective(to_run); } -NcclManager::Collective* NcclManager::CheckReady(const string& collective_key, - Collective* collective) { - Collective* to_run = nullptr; +bool NcclManager::CheckReady(const string& collective_key, + Collective* collective) { if (collective->available_participants == collective->num_local_devices) { if (collective->num_global_devices == collective->num_local_devices || collective->multi_node_ready) { // Ownership transferred to callee. - to_run = collective; - auto collectives_it = collectives_.find(collective_key); - collectives_it->second.release(); - collectives_.erase(collectives_it); + collectives_.erase(collective_key); + return true; } } - return to_run; + return false; } void NcclManager::RunCollective(Collective* collective) { @@ -500,7 +507,7 @@ void NcclManager::RunCollective(Collective* collective) { for (int i = 0; i < collective->num_local_devices; ++i) { collective->participants[i]->done_callback(s); } - delete collective; + collective->Unref(); return; } @@ -537,9 +544,13 @@ void NcclManager::RunCollective(Collective* collective) { collective->communicator->members[i].nccl_stream; mutex_lock l(nccl_stream->mu); nccl_stream->pending_launches_.push_front(std::make_pair(collective, i)); + // Ownership is shared between LoopKernelLaunches for each stream in this + // collective. + collective->Ref(); nccl_stream->cv.notify_all(); } } + collective->Unref(); } void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) { @@ -629,15 +640,7 @@ void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) { collective->participants[p_idx]->done_callback(errors::Unknown( "Error invoking NCCL: ", ncclGetErrorString(nccl_result))); } - - // TODO(cwhipkey): use RefCounted after figuring out how to use in a - // custom op library. - // See tensorflow/core/lib/core/refcount.h for details on this locking. - if (collective->remaining_participants.load(std::memory_order_acquire) == - 1 || - collective->remaining_participants.fetch_sub(1) == 1) { - delete collective; - } + collective->Unref(); }; p->event_mgr->ThenExecute(comm_stream, done_callback); } diff --git a/tensorflow/core/nccl/nccl_manager.h b/tensorflow/core/nccl/nccl_manager.h index f2f15f8ec64..d968fac833b 100644 --- a/tensorflow/core/nccl/nccl_manager.h +++ b/tensorflow/core/nccl/nccl_manager.h @@ -198,13 +198,13 @@ class NcclManager { ncclRedOp_t reduction_op); // If `collective` is ready to run, removes it from the `collectives_` map and - // returns the pointer. Otherwise returns `nullptr`. + // returns true. Otherwise returns false. // Assumes `collective_key` corresponds to `collective`. // // A collective is ready to run when all local participants have called Add* // function, and the collective is signalled globally ready via // `SetMultiNodeReady`. - Collective* CheckReady(const string& collective_key, Collective* collective) + bool CheckReady(const string& collective_key, Collective* collective) EXCLUSIVE_LOCKS_REQUIRED(mu_); // Run <collective>. This calls takes ownership of <collective>. @@ -214,13 +214,12 @@ class NcclManager { mutex mu_; // Maps key to collectives currently being assembled or run. 
- std::unordered_map<string, std::unique_ptr<Collective>> collectives_ - GUARDED_BY(mu_); + std::unordered_map<string, Collective*> collectives_ GUARDED_BY(mu_); // Maps a device to the communication streams that make up its collective. // This is used to share the stream across different communicators that // include the same device. - std::map<se::StreamExecutor*, std::vector<std::unique_ptr<NcclStream>>> + std::map<se::StreamExecutor*, std::vector<NcclStream*>> device_to_comm_streams_ GUARDED_BY(mu_); std::vector<std::unique_ptr<Communicator>> communicators_; diff --git a/tensorflow/core/nccl/nccl_manager_test.cc b/tensorflow/core/nccl/nccl_manager_test.cc index fcca62bdea0..06564ee8020 100644 --- a/tensorflow/core/nccl/nccl_manager_test.cc +++ b/tensorflow/core/nccl/nccl_manager_test.cc @@ -66,7 +66,6 @@ class NcclManagerTest : public ::testing::Test { static void SetUpTestCase() { setenv("NCCL_DEBUG", "INFO", 1 /* replace */); setenv("NCCL_LAUNCH_MODE", "PARALLEL", 1 /* replace */); - setenv("TF_CPP_VMODULE", "nccl_manager=2", 1 /* replace */); devices_ = new std::vector<std::unique_ptr<BaseGPUDevice>>(GetGPUDevices()); LOG(INFO) << "Running test with " << devices_->size() << " gpus"; } diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index af9ad33e09c..ccbf4177b98 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -17,6 +17,9 @@ limitations under the License. #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/util/mirror_pad_mode.h" #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/strided_slice_op.h" @@ -3123,6 +3126,37 @@ REGISTER_OP("FakeQuantWithMinMaxVarsPerChannelGradient") return Status::OK(); }); +REGISTER_OP("Fingerprint") + .Input("data: T") + .Input("method: string") + .Output("fingerprint: uint8") + .Attr("T: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + + DimensionHandle fingerprint_size; + const Tensor* method = c->input_tensor(1); + if (method == nullptr) { + fingerprint_size = c->UnknownDim(); + } else { + if (method->dims() != 0) { + return errors::InvalidArgument("`method` must be rank 0: ", + method->shape()); + } + const string& method_string = method->scalar<string>()(); + if (method_string != "farmhash64") { + return errors::InvalidArgument("Unsupported method: ", method_string); + } + fingerprint_size = c->MakeDim(sizeof(uint64)); + } + + DimensionHandle batch = c->Dim(c->input(0), 0); + c->set_output(0, c->MakeShape({batch, fingerprint_size})); + return Status::OK(); + }); + #ifdef INTEL_MKL REGISTER_OP("_MklConcat") .Input("concat_dim: int32") diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index 7ce7d6e00c7..e98827a2528 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -167,6 +167,7 @@ REGISTER_OP("PrefetchDataset") .Output("handle: variant") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") + .Attr("slack_period: int = 0") .SetShapeFn([](shape_inference::InferenceContext* c) { shape_inference::ShapeHandle unused; // buffer_size should be a scalar. 
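The Fingerprint shape function registered above only pins the second output dimension when the method tensor is a statically known scalar: for "farmhash64" each outer row hashes to sizeof(uint64) = 8 bytes, and otherwise the dimension is left unknown. A small illustration of the resulting static shapes (hypothetical helper, not part of the op):

#include <cstdint>
#include <vector>

// Hypothetical helper mirroring the shape rule above. An input of shape
// [batch, d1, ..., dn] yields a fingerprint of shape [batch, 8] when the
// method is statically known to be "farmhash64"; otherwise [batch, -1],
// with -1 standing in for an unknown dimension.
std::vector<int64_t> FingerprintShape(const std::vector<int64_t>& data_shape,
                                      bool method_statically_known) {
  const int64_t kUnknown = -1;
  const int64_t batch = data_shape.empty() ? kUnknown : data_shape[0];
  const int64_t width = method_statically_known
                            ? static_cast<int64_t>(sizeof(uint64_t))
                            : kUnknown;
  return {batch, width};  // e.g. {32, 8} for a [32, 128, 128] input.
}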
diff --git a/tensorflow/core/platform/default/cuda_build_defs.bzl b/tensorflow/core/platform/default/cuda_build_defs.bzl new file mode 100644 index 00000000000..8b0b3f55960 --- /dev/null +++ b/tensorflow/core/platform/default/cuda_build_defs.bzl @@ -0,0 +1,8 @@ +"""Open source build configurations for CUDA.""" + +load("@local_config_cuda//cuda:build_defs.bzl", _if_cuda_is_configured = "if_cuda_is_configured") + +# We perform this indirection so that the copybara tool can distinguish this +# macro from others provided by the same file. +def if_cuda_is_configured(x): + return _if_cuda_is_configured(x) diff --git a/tensorflow/core/platform/default/cuda_libdevice_path.cc b/tensorflow/core/platform/default/cuda_libdevice_path.cc index a8b2e7202ac..25eb6ab463b 100644 --- a/tensorflow/core/platform/default/cuda_libdevice_path.cc +++ b/tensorflow/core/platform/default/cuda_libdevice_path.cc @@ -19,7 +19,7 @@ limitations under the License. #include <vector> #if !defined(PLATFORM_GOOGLE) -#include "cuda/cuda_config.h" +#include "third_party/gpus/cuda/cuda_config.h" #endif #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc index 2f13f127116..38cdb65c566 100644 --- a/tensorflow/core/platform/default/device_tracer.cc +++ b/tensorflow/core/platform/default/device_tracer.cc @@ -25,7 +25,7 @@ limitations under the License. #include "absl/strings/ascii.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" -#include "cuda/extras/CUPTI/include/cupti.h" +#include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h" #include "tensorflow/core/common_runtime/step_stats_collector.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/lib/core/errors.h" diff --git a/tensorflow/core/platform/fingerprint.h b/tensorflow/core/platform/fingerprint.h index d460514accc..ae41a8e541a 100644 --- a/tensorflow/core/platform/fingerprint.h +++ b/tensorflow/core/platform/fingerprint.h @@ -81,7 +81,12 @@ inline uint64 Fingerprint64(StringPiece s) { #ifdef USE_OSS_FARMHASH return ::util::Fingerprint64(s.data(), s.size()); #else + // Fingerprint op depends on the fact that Fingerprint64() is implemented by + // Farmhash. If the implementation ever changes, Fingerprint op should be + // modified to keep using Farmhash. + // LINT.IfChange return farmhash::Fingerprint64(s.data(), s.size()); + // LINT.ThenChange(//third_party/tensorflow/core/kernels/fingerprint_op.cc) #endif } diff --git a/tensorflow/core/platform/posix/posix_file_system.cc b/tensorflow/core/platform/posix/posix_file_system.cc index 083284c5ff9..590fdc190f5 100644 --- a/tensorflow/core/platform/posix/posix_file_system.cc +++ b/tensorflow/core/platform/posix/posix_file_system.cc @@ -16,6 +16,7 @@ limitations under the License. #include <dirent.h> #include <errno.h> #include <fcntl.h> +#include <stdint.h> #include <stdio.h> #include <sys/mman.h> #if defined(__linux__) @@ -62,7 +63,16 @@ class PosixRandomAccessFile : public RandomAccessFile { Status s; char* dst = scratch; while (n > 0 && s.ok()) { - ssize_t r = pread(fd_, dst, n, static_cast<off_t>(offset)); + // Some platforms, notably macs, throw EINVAL if pread is asked to read + // more than fits in a 32-bit integer. 
+ size_t requested_read_length; + if (n > INT32_MAX) { + requested_read_length = INT32_MAX; + } else { + requested_read_length = n; + } + ssize_t r = + pread(fd_, dst, requested_read_length, static_cast<off_t>(offset)); if (r > 0) { dst += r; n -= r; @@ -105,6 +115,9 @@ class PosixWritableFile : public WritableFile { } Status Close() override { + if (file_ == nullptr) { + return IOError(filename_, EBADF); + } Status result; if (fclose(file_) != 0) { result = IOError(filename_, errno); @@ -325,10 +338,9 @@ Status PosixFileSystem::CopyFile(const string& src, const string& target) { string translated_target = TranslateName(target); // O_WRONLY | O_CREAT: // Open file for write and if file does not exist, create the file. - // S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH: - // Create the file with permission of 0644 - int target_fd = open(translated_target.c_str(), O_WRONLY | O_CREAT, - S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + // When creating file, use the same permissions as original + mode_t mode = sbuf.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO); + int target_fd = open(translated_target.c_str(), O_WRONLY | O_CREAT, mode); if (target_fd < 0) { close(src_fd); return IOError(target, errno); diff --git a/tensorflow/core/profiler/internal/traceme_recorder.cc b/tensorflow/core/profiler/internal/traceme_recorder.cc index 1c6be97a0e4..b2a20c7955b 100644 --- a/tensorflow/core/profiler/internal/traceme_recorder.cc +++ b/tensorflow/core/profiler/internal/traceme_recorder.cc @@ -14,21 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/internal/traceme_recorder.h" -// To avoid unnecessary synchronization between threads, each thread has a -// ThreadLocalRecorder that independently records its events. -// -// Events are stored in an EventQueue implemented as a linked-list of blocks, -// with start and end pointers: -// [ events........ | next-]--> [ events......... | next ] -// ^start_block ^start ^end_block ^end -// -// Record() writes at end, and then advances it, allocating a block if needed. -// Clear() takes ownership of events in the range [start, end). -// The end pointer is atomic so these can be concurrent. -// -// If a thread dies, the ThreadLocalRecorder's destructor hands its data off to -// the orphaned_events list. - #include <cstddef> #include "tensorflow/core/platform/env.h" @@ -48,17 +33,27 @@ namespace { // A single-producer single-consumer queue of Events. // -// Push and Consume are lock free and each might be called from at most one -// thread. Push is only be called by the owner thread. Consume is called by the -// owner thread when it shuts down, or by the tracing control thread. -// Thus, Consume might race with Push, so Consume only removes events that were -// in the queue when it was invoked. If Push is called while Consume is active, -// the new event remains in the queue. Thus, the tracing control thread should -// call Consume when tracing stops to remove events created during tracing, but -// also when tracing starts again to clear any remaining events. +// Implemented as a linked-list of blocks containing numbered slots, with start +// and end pointers: // -// Internally, we have a linked list of blocks containing numbered slots. -// start is the first occupied slot, end is the first unoccupied slot. +// [ events........ | next-]--> [ events......... | next ] +// ^start_block_ ^start_ ^end_block_ ^end_ +// +// start_ is the first occupied slot, end_ is the first unoccupied slot. 
+// +// Push writes at end_, and then advances it, allocating a block if needed. +// PopAll takes ownership of events in the range [start_, end_). +// The end_ pointer is atomic so Push and PopAll can be concurrent. +// +// Push and PopAll are lock free and each might be called from at most one +// thread. Push is only called by the owner thread. PopAll is called by the +// owner thread when it shuts down, or by the tracing control thread. +// +// Thus, PopAll might race with Push, so PopAll only removes events that were +// in the queue when it was invoked. If Push is called while PopAll is active, +// the new event remains in the queue. Thus, the tracing control thread should +// call PopAll when tracing stops to remove events created during tracing, but +// also when tracing starts again to clear any remaining events. class EventQueue { public: EventQueue() @@ -67,13 +62,13 @@ class EventQueue { end_block_(start_block_), end_(start_) {} - // REQUIRES: Consume() was called since the last Push(). + // REQUIRES: PopAll() was called since the last Push(). // Memory should be deallocated and trace events destroyed on destruction. // This doesn't require global lock as this discards all the stored trace - // events and we assume of destruction of this class only after the last + // events and we assume of destruction of this instance only after the last // Push() has been called. ~EventQueue() { - DCHECK_EQ(start_, end_.load()) << "EventQueue destroyed without Consume()"; + DCHECK(Empty()) << "EventQueue destroyed without PopAll()"; delete end_block_; } @@ -91,25 +86,32 @@ class EventQueue { } // Retrieve and remove all events in the queue at the time of invocation. - // If Push is called while Consume is active, the new event will not be + // If Push is called while PopAll is active, the new event will not be // removed from the queue. - std::vector<TraceMeRecorder::Event> Consume() { + std::vector<TraceMeRecorder::Event> PopAll() { // Read index before contents. size_t end = end_.load(std::memory_order_acquire); std::vector<TraceMeRecorder::Event> result; result.reserve(end - start_); while (start_ != end) { - Shift(&result); + result.emplace_back(Pop()); } return result; } private: - // Shift one event off the front of the queue into *out. - void Shift(std::vector<TraceMeRecorder::Event>* out) { + // Returns true if the queue is empty at the time of invocation. + bool Empty() const { + return (start_ == end_.load(std::memory_order_acquire)); + } + + // Remove one event off the front of the queue and return it. + // REQUIRES: The queue must not be empty. + TraceMeRecorder::Event Pop() { + DCHECK(!Empty()); // Move the next event into the output. auto& event = start_block_->events[start_++ - start_block_->start].event; - out->push_back(std::move(event)); + TraceMeRecorder::Event out = std::move(event); event.~Event(); // Events must be individually destroyed. // If we reach the end of a block, we own it and should delete it. // The next block is present: end always points to something. @@ -117,10 +119,11 @@ class EventQueue { auto* next_block = start_block_->next; delete start_block_; start_block_ = next_block; + DCHECK_EQ(start_, start_block_->start); } + return out; } - // The number of slots in a block. Chosen so that the block fits in 64k. struct Block { // The number of slots in a block is chosen so the block fits in 64 KiB. 
static constexpr size_t kSize = 1 << 16; @@ -151,6 +154,8 @@ class EventQueue { } // namespace +// To avoid unnecessary synchronization between threads, each thread has a +// ThreadLocalRecorder that independently records its events. class TraceMeRecorder::ThreadLocalRecorder { public: // The recorder is created the first time TraceMeRecorder::Record() is called @@ -170,7 +175,7 @@ class TraceMeRecorder::ThreadLocalRecorder { // Clear is called from the control thread when tracing starts/stops, or from // the owner thread when it shuts down (see destructor). - TraceMeRecorder::ThreadEvents Clear() { return {info_, queue_.Consume()}; } + TraceMeRecorder::ThreadEvents Clear() { return {info_, queue_.PopAll()}; } private: TraceMeRecorder::ThreadInfo info_; diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto index d85664029fc..4d6212422fd 100644 --- a/tensorflow/core/protobuf/config.proto +++ b/tensorflow/core/protobuf/config.proto @@ -492,6 +492,12 @@ message ConfigProto { // but in the case where there is a lot of spinning may result in lower // CPU usage. bool disable_thread_spinning = 9; + + // When true, WorkerSessions are created with device attributes from the + // full cluster. + // This is helpful when a worker wants to partition a graph + // (for example during a PartitionedCallOp). + bool share_cluster_devices_in_session = 10; }; Experimental experimental = 16; diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto index e7685d51024..0bea9aa4ee5 100644 --- a/tensorflow/core/protobuf/worker.proto +++ b/tensorflow/core/protobuf/worker.proto @@ -67,6 +67,9 @@ message CreateWorkerSessionRequest { // If true, any resources such as Variables used in the session will not be // shared with other sessions. bool isolate_session_state = 3; + + // The device attributes of all the devices in the cluster. + repeated DeviceAttributes cluster_device_attributes = 4; } message CreateWorkerSessionResponse {} diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 95b443ef3a5..24dab898d59 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -36,8 +36,6 @@ limitations under the License. (TF_STR(TF_MAJOR_VERSION) "." TF_STR(TF_MINOR_VERSION) "." TF_STR( \ TF_PATCH_VERSION) TF_VERSION_SUFFIX) -// TODO(josh11b): Public API functions for exporting the above. - // GraphDef compatibility versions (the versions field in graph.proto). // // Each graph has producer and min_consumer versions, and each @@ -100,12 +98,17 @@ limitations under the License. // deprecated in favor of V2 ops. (2018/01/23) // 28. Deprecate MatrixExponential op in favor of Python implementation. // (2018/08/21). +// (2019/02/15). Added `control_ret` field to FunctionDef proto, and +// `control_output` field to OpDef proto. // 29. Deprecate StatefulStandardNormal op in favor of StatefulStandardNormalV2. // (2019/03/25). +// (2019/04/17). Added `arg_attr` field to FunctionDefProto. +// 30. (2019/05/09) First date based GraphDef version. GraphDef +// versions advance by 1 each day after this point. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 29 +#define TF_GRAPH_DEF_VERSION 30 // Updated: 2019/05/09 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). 
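Per the GraphDef version comment above, version 30 is pinned to 2019/05/09 and the version then advances by one per day, so the value expected for any later date is simply the day offset added to 30. A trivial sketch of that arithmetic (illustrative only; the authoritative value is the hard-coded TF_GRAPH_DEF_VERSION):

// Illustrative only: version 30 corresponds to 2019/05/09, +1 per day after.
constexpr int kFirstDateBasedGraphDefVersion = 30;

constexpr int ExpectedGraphDefVersion(int days_after_2019_05_09) {
  return kFirstDateBasedGraphDefVersion + days_after_2019_05_09;
}

static_assert(ExpectedGraphDefVersion(10) == 40,  // i.e. 2019/05/19
              "ten days after the first date-based version");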
// diff --git a/tensorflow/core/util/device_name_utils.cc b/tensorflow/core/util/device_name_utils.cc index 181708c90d2..35d34221a6c 100644 --- a/tensorflow/core/util/device_name_utils.cc +++ b/tensorflow/core/util/device_name_utils.cc @@ -538,4 +538,10 @@ std::vector<string> DeviceNameUtils::GetLocalNamesForDeviceMappings( return Status::OK(); } +std::ostream& operator<<(std::ostream& os, + const DeviceNameUtils::ParsedName& x) { + os << DeviceNameUtils::ParsedNameToString(x); + return os; +} + } // namespace tensorflow diff --git a/tensorflow/core/util/device_name_utils.h b/tensorflow/core/util/device_name_utils.h index 45b950249ef..651231d4db8 100644 --- a/tensorflow/core/util/device_name_utils.h +++ b/tensorflow/core/util/device_name_utils.h @@ -193,6 +193,9 @@ class DeviceNameUtils { string* host_device_name); }; +std::ostream& operator<<(std::ostream& os, + const DeviceNameUtils::ParsedName& x); + } // namespace tensorflow #endif // TENSORFLOW_CORE_UTIL_DEVICE_NAME_UTILS_H_ diff --git a/tensorflow/core/util/gpu_cuda_alias.h b/tensorflow/core/util/gpu_cuda_alias.h new file mode 100644 index 00000000000..5a05700d34a --- /dev/null +++ b/tensorflow/core/util/gpu_cuda_alias.h @@ -0,0 +1,60 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_ +#define TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_ + +// Several forwarding macros are defined in this file to serve for backward +// compatibility usage as we migrating from Cuda prefixed function to Gpu +// prefixed functions. Both Cuda and ROCm can unify under the new Gpu prefix +// naming scheme. In the migration period, we provide equivalent Cuda* and Gpu* +// function. Over time, all Cuda* functions will be deprecated. + +namespace tensorflow { + +// CREATE_CUDA_HOST_FUNCTION_ALIAS forward the host function to its Cuda Alias. +#ifndef TENSORFLOW_USE_ROCM +#define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) \ + template <typename... Args> \ + auto cuda_alias(Args&&... args) \ + ->decltype(func(std::forward<Args>(args)...)) { \ + return func(std::forward<Args>(args)...); \ + } +#else +#define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) +#endif + +// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forward the device function to its Cuda +// Alias. +#ifndef TENSORFLOW_USE_ROCM +#define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias) \ + template <typename... Args> \ + __device__ auto cuda_alias(Args&&... args) \ + ->decltype(func(std::forward<Args>(args)...)) { \ + return func(std::forward<Args>(args)...); \ + } +#else +#define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias) +#endif + +// CREATE_CUDA_TYPE_ALIAS forward the type to its Cuda Alias. 
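The forwarding macros in this new header let the old Cuda-prefixed names keep compiling while the Gpu-prefixed names become primary, and they expand to nothing on ROCm builds. As a concrete illustration of what they produce, a CREATE_CUDA_HOST_FUNCTION_ALIAS or CREATE_CUDA_TYPE_ALIAS invocation expands roughly as follows (shown with the GpuLaunchConfig names used later in gpu_launch_config.h):

#include <utility>  // std::forward

// CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuLaunchConfig, GetCudaLaunchConfig)
// expands, on CUDA builds, to a perfect-forwarding wrapper equivalent to:
template <typename... Args>
auto GetCudaLaunchConfig(Args&&... args)
    -> decltype(GetGpuLaunchConfig(std::forward<Args>(args)...)) {
  return GetGpuLaunchConfig(std::forward<Args>(args)...);
}

// CREATE_CUDA_TYPE_ALIAS(GpuLaunchConfig, CudaLaunchConfig) expands to:
using CudaLaunchConfig = GpuLaunchConfig;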
+#ifndef TENSORFLOW_USE_ROCM +#define CREATE_CUDA_TYPE_ALIAS(type, cuda_alias) using cuda_alias = type; +#else +#define CREATE_CUDA_TYPE_ALIAS(type, cuda_alias) +#endif +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_ diff --git a/tensorflow/core/util/gpu_device_functions.h b/tensorflow/core/util/gpu_device_functions.h index 7e8742b6a15..049d6e00b86 100644 --- a/tensorflow/core/util/gpu_device_functions.h +++ b/tensorflow/core/util/gpu_device_functions.h @@ -31,8 +31,8 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #if GOOGLE_CUDA -#include "cuda/include/cuComplex.h" -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuComplex.h" +#include "third_party/gpus/cuda/include/cuda.h" #endif #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/core/util/gpu_kernel_helper.h b/tensorflow/core/util/gpu_kernel_helper.h index b3c2f21b392..3cc8a20e504 100644 --- a/tensorflow/core/util/gpu_kernel_helper.h +++ b/tensorflow/core/util/gpu_kernel_helper.h @@ -19,7 +19,7 @@ limitations under the License. #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if GOOGLE_CUDA -#include "cuda/include/cuda_fp16.h" +#include "third_party/gpus/cuda/include/cuda_fp16.h" #endif #include "tensorflow/core/util/gpu_device_functions.h" #include "tensorflow/core/util/gpu_launch_config.h" @@ -30,18 +30,6 @@ limitations under the License. #define TF_RED_WARPSIZE 64 #endif -#if GOOGLE_CUDA -#define GPU_LAUNCH_KERNEL(kernel, block_count, threads_per_block, shared_mem, \ - stream, ...) \ - TF_CHECK_OK(CudaLaunchKernel(kernel, block_count, threads_per_block, \ - shared_mem, stream, __VA_ARGS__)); -#elif TENSORFLOW_USE_ROCM -#define GPU_LAUNCH_KERNEL(kernel, block_count, threads_per_block, shared_mem, \ - stream, ...) \ - hipLaunchKernelGGL(kernel, block_count, threads_per_block, shared_mem, \ - stream, __VA_ARGS__); -#endif - // Deprecated, use 'for(int i : CudaGridRangeX(n))' instead. #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i : ::tensorflow::CudaGridRangeX<int>(n)) diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h index 9e056c591a6..2d08e2b988e 100644 --- a/tensorflow/core/util/gpu_launch_config.h +++ b/tensorflow/core/util/gpu_launch_config.h @@ -26,45 +26,46 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/gpu_cuda_alias.h" -// Usage of GetCudaLaunchConfig, GetCuda2DLaunchConfig, and -// GetCuda3DLaunchConfig: +// Usage of GetGpuLaunchConfig, GetGpu2DLaunchConfig, and +// GetGpu3DLaunchConfig: // -// There are two versions of GetCudaLaunchConfig and GetCuda2DLaunchConfig, one +// There are two versions of GetGpuLaunchConfig and GetGpu2DLaunchConfig, one // version uses heuristics without any knowledge of the device kernel, the other // version uses cudaOccupancyMaxPotentialBlockSize to determine the theoretical // launch parameters that maximize occupancy. Currently, only the maximum -// occupancy version of GetCuda3DLaunchConfig is available. +// occupancy version of GetGpu3DLaunchConfig is available. // // For large number of work elements, the convention is that each kernel would -// iterate through its assigned range. The return value of GetCudaLaunchConfig -// is struct CudaLaunchConfig, which contains all the information needed for the +// iterate through its assigned range. 
The return value of GetGpuLaunchConfig +// is struct GpuLaunchConfig, which contains all the information needed for the // kernel launch, including: virtual number of threads, the number of threads // per block and number of threads per block used inside <<< >>> of a kernel -// launch. GetCuda2DLaunchConfig and GetCuda3DLaunchConfig does the same thing -// as CudaLaunchConfig. The only difference is the dimension. The macros -// CUDA_1D_KERNEL_LOOP and CUDA_AXIS_KERNEL_LOOP might be used to do inner loop. +// launch. GetGpu2DLaunchConfig and GetGpu3DLaunchConfig does the same thing +// as GpuLaunchConfig. The only difference is the dimension. The macros +// GPU_1D_KERNEL_LOOP and GPU_AXIS_KERNEL_LOOP might be used to do inner loop. // /* Sample code: -__global__ void MyKernel1D(CudaLaunchConfig config, other_args...) { - CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) { +__global__ void MyKernel1D(GpuLaunchConfig config, other_args...) { + GPU_1D_KERNEL_LOOP(x, config.virtual_thread_count) { do_your_job_here; } } -__global__ void MyKernel2D(Cuda2DLaunchConfig config, other_args...) { - CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) { - CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) { +__global__ void MyKernel2D(Gpu2DLaunchConfig config, other_args...) { + GPU_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) { + GPU_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) { do_your_job_here; } } } -__global__ void MyKernel3D(Cuda3DLaunchConfig config, other_args...) { - CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) { - CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) { - CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) { +__global__ void MyKernel3D(Gpu3DLaunchConfig config, other_args...) { + GPU_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) { + GPU_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) { + GPU_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) { do_your_job_here; } } @@ -73,25 +74,25 @@ __global__ void MyKernel3D(Cuda3DLaunchConfig config, other_args...) 
{ void MyDriverFunc(const Eigen::GpuDevice &d) { // use heuristics - CudaLaunchConfig cfg1 = GetCudaLaunchConfig(10240, d); + GpuLaunchConfig cfg1 = GetGpuLaunchConfig(10240, d); MyKernel1D <<<config.block_count, config.thread_per_block, 0, d.stream()>>> (cfg1, other_args...); - Cuda2DLaunchConfig cfg2 = GetCuda2DLaunchConfig(10240, 10240, d); + Gpu2DLaunchConfig cfg2 = GetGpu2DLaunchConfig(10240, 10240, d); MyKernel2D <<<config.block_count, config.thread_per_block, 0, d.stream()>>> (cfg2, other_args...); - Cuda3DLaunchConfig cfg3 = GetCuda3DLaunchConfig(4096, 4096, 100, d); + Gpu3DLaunchConfig cfg3 = GetGpu3DLaunchConfig(4096, 4096, 100, d); MyKernel3D <<<config.block_count, config.thread_per_block, 0, d.stream()>>> (cfg3, other_args...); // maximize occupancy - CudaLaunchConfig cfg4 = GetCudaLaunchConfig(10240, d, MyKernel1D, 0, 0 ); + GpuLaunchConfig cfg4 = GetGpuLaunchConfig(10240, d, MyKernel1D, 0, 0 ); MyKernel1D <<<config.block_count, config.thread_per_block, 0, d.stream()>>> (cfg4, other_args...); - Cuda2DLaunchConfig cfg5 = GetCuda2DLaunchConfig(10240, 10240, d, + Gpu2DLaunchConfig cfg5 = GetGpu2DLaunchConfig(10240, 10240, d, MyKernel1D, 0, 0); MyKernel2D <<<config.block_count, config.thread_per_block, 0, d.stream()>>> (cfg5, other_args...); - Cuda3DLaunchConfig cfg6 = GetCuda3DLaunchConfig(4096, 4096, 100, d, + Gpu3DLaunchConfig cfg6 = GetGpu3DLaunchConfig(4096, 4096, 100, d, MyKernel1D, 0, 0); MyKernel3D <<<config.block_count, config.thread_per_block, 0, d.stream()>>> (cfg6, other_args...); @@ -107,25 +108,26 @@ namespace tensorflow { inline int DivUp(int a, int b) { return (a + b - 1) / b; } -struct CudaLaunchConfig { +struct GpuLaunchConfig { // Logical number of thread that works on the elements. If each logical // thread works on exactly a single element, this is the same as the working // element count. int virtual_thread_count = -1; // Number of threads per block. int thread_per_block = -1; - // Number of blocks for Cuda kernel launch. + // Number of blocks for GPU kernel launch. int block_count = -1; }; +CREATE_CUDA_TYPE_ALIAS(GpuLaunchConfig, CudaLaunchConfig); -// Calculate the Cuda launch config we should use for a kernel launch. +// Calculate the GPU launch config we should use for a kernel launch. // This is assuming the kernel is quite simple and will largely be // memory-limited. // REQUIRES: work_element_count > 0. -inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count, - const Eigen::GpuDevice& d) { +inline GpuLaunchConfig GetGpuLaunchConfig(int work_element_count, + const Eigen::GpuDevice& d) { CHECK_GT(work_element_count, 0); - CudaLaunchConfig config; + GpuLaunchConfig config; const int virtual_thread_count = work_element_count; const int physical_thread_count = std::min( d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(), @@ -140,18 +142,21 @@ inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count, config.block_count = block_count; return config; } +inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count, + const Eigen::GpuDevice& d) { + return GetGpuLaunchConfig(work_element_count, d); +} -// Calculate the Cuda launch config we should use for a kernel launch. This +// Calculate the GPU launch config we should use for a kernel launch. This // variant takes the resource limits of func into account to maximize occupancy. // REQUIRES: work_element_count > 0. 
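As a usage sketch only (the ScaleKernel / LaunchScale names are hypothetical and not part of this patch), the occupancy-based overload defined just below pairs naturally with CudaLaunchKernel, which is the call that the GPU_LAUNCH_KERNEL macro removed from gpu_kernel_helper.h used to wrap:

// Hypothetical element-wise kernel; iterates its assigned grid-stride range.
__global__ void ScaleKernel(GpuLaunchConfig config, float* data, float alpha) {
  for (int i : CudaGridRangeX<int>(config.virtual_thread_count)) {
    data[i] *= alpha;
  }
}

void LaunchScale(const Eigen::GpuDevice& d, float* data, int count,
                 float alpha) {
  // Let the cudaOccupancyMaxPotentialBlockSize-based overload pick the block
  // size for ScaleKernel.
  GpuLaunchConfig cfg = GetGpuLaunchConfig(count, d, ScaleKernel,
                                           /*dynamic_shared_memory_size=*/0,
                                           /*block_size_limit=*/0);
  TF_CHECK_OK(CudaLaunchKernel(ScaleKernel, cfg.block_count,
                               cfg.thread_per_block, /*shared_mem=*/0,
                               d.stream(), cfg, data, alpha));
}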
template <typename DeviceFunc> -inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count, - const Eigen::GpuDevice& d, - DeviceFunc func, - size_t dynamic_shared_memory_size, - int block_size_limit) { +GpuLaunchConfig GetGpuLaunchConfig(int work_element_count, + const Eigen::GpuDevice& d, DeviceFunc func, + size_t dynamic_shared_memory_size, + int block_size_limit) { CHECK_GT(work_element_count, 0); - CudaLaunchConfig config; + GpuLaunchConfig config; int block_count = 0; int thread_per_block = 0; @@ -188,17 +193,25 @@ inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count, config.block_count = block_count; return config; } +template <typename DeviceFunc> +CudaLaunchConfig GetCudaLaunchConfig(int work_element_count, + const Eigen::GpuDevice& d, DeviceFunc func, + size_t dynamic_shared_memory_size, + int block_size_limit) { + return GetGpuLaunchConfig(work_element_count, d, func, + dynamic_shared_memory_size, block_size_limit); +} -// Calculate the Cuda launch config we should use for a kernel launch. This +// Calculate the GPU launch config we should use for a kernel launch. This // variant takes the resource limits of func into account to maximize occupancy. // The returned launch config has thread_per_block set to fixed_block_size. // REQUIRES: work_element_count > 0. template <typename DeviceFunc> -inline CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize( +GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize( int work_element_count, const Eigen::GpuDevice& d, DeviceFunc func, size_t dynamic_shared_memory_size, int fixed_block_size) { CHECK_GT(work_element_count, 0); - CudaLaunchConfig config; + GpuLaunchConfig config; int block_count = 0; #if GOOGLE_CUDA @@ -232,16 +245,25 @@ inline CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize( config.block_count = block_count; return config; } +template <typename DeviceFunc> +CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize( + int work_element_count, const Eigen::GpuDevice& d, DeviceFunc func, + size_t dynamic_shared_memory_size, int fixed_block_size) { + return GetGpuLaunchConfigFixedBlockSize(work_element_count, d, func, + dynamic_shared_memory_size, + fixed_block_size); +} -struct Cuda2DLaunchConfig { +struct Gpu2DLaunchConfig { dim3 virtual_thread_count = dim3(0, 0, 0); dim3 thread_per_block = dim3(0, 0, 0); dim3 block_count = dim3(0, 0, 0); }; +CREATE_CUDA_TYPE_ALIAS(Gpu2DLaunchConfig, Cuda2DLaunchConfig); -inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim, - const Eigen::GpuDevice& d) { - Cuda2DLaunchConfig config; +inline Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim, + const Eigen::GpuDevice& d) { + Gpu2DLaunchConfig config; if (xdim <= 0 || ydim <= 0) { return config; @@ -266,17 +288,24 @@ inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim, grid_x, std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)), 1); return config; } +inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim, + const Eigen::GpuDevice& d) { + return GetGpu2DLaunchConfig(xdim, ydim, d); +} -// Calculate the Cuda 2D and 3D launch config we should use for a kernel launch. +// Calculate the GPU 2D and 3D launch config we should use for a kernel launch. // This variant takes the resource limits of func into account to maximize // occupancy. 
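The header's sample comment gives no example for the fixed-block-size variant added above; continuing the hypothetical ScaleKernel sketch from earlier (again, not part of this patch), the only difference is that the caller pins thread_per_block and block_count is derived from the occupancy query and the element count:

// Request exactly 256 threads per block for the hypothetical ScaleKernel.
GpuLaunchConfig cfg = GetGpuLaunchConfigFixedBlockSize(
    count, d, ScaleKernel, /*dynamic_shared_memory_size=*/0,
    /*fixed_block_size=*/256);
TF_CHECK_OK(CudaLaunchKernel(ScaleKernel, cfg.block_count, cfg.thread_per_block,
                             /*shared_mem=*/0, d.stream(), cfg, data, alpha));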
-using Cuda3DLaunchConfig = Cuda2DLaunchConfig; +using Gpu3DLaunchConfig = Gpu2DLaunchConfig; +CREATE_CUDA_TYPE_ALIAS(Gpu3DLaunchConfig, Cuda3DLaunchConfig); template <typename DeviceFunc> -inline Cuda3DLaunchConfig GetCuda3DLaunchConfig( - int xdim, int ydim, int zdim, const Eigen::GpuDevice& d, DeviceFunc func, - size_t dynamic_shared_memory_size, int block_size_limit) { - Cuda3DLaunchConfig config; +Cuda3DLaunchConfig GetGpu3DLaunchConfig(int xdim, int ydim, int zdim, + const Eigen::GpuDevice& d, + DeviceFunc func, + size_t dynamic_shared_memory_size, + int block_size_limit) { + Gpu3DLaunchConfig config; if (xdim <= 0 || ydim <= 0 || zdim <= 0) { return config; @@ -340,13 +369,24 @@ inline Cuda3DLaunchConfig GetCuda3DLaunchConfig( config.block_count = dim3(blocksx, blocksy, blocksz); return config; } +template <typename DeviceFunc> +Cuda3DLaunchConfig GetCuda3DLaunchConfig(int xdim, int ydim, int zdim, + const Eigen::GpuDevice& d, + DeviceFunc func, + size_t dynamic_shared_memory_size, + int block_size_limit) { + return GetGpu3DLaunchConfig(xdim, ydim, zdim, d, func, + dynamic_shared_memory_size, block_size_limit); +} template <typename DeviceFunc> -inline Cuda2DLaunchConfig GetCuda2DLaunchConfig( - int xdim, int ydim, const Eigen::GpuDevice& d, DeviceFunc func, - size_t dynamic_shared_memory_size, int block_size_limit) { - return GetCuda3DLaunchConfig(xdim, ydim, 1, d, func, - dynamic_shared_memory_size, block_size_limit); +Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim, + const Eigen::GpuDevice& d, + DeviceFunc func, + size_t dynamic_shared_memory_size, + int block_size_limit) { + return GetGpu3DLaunchConfig(xdim, ydim, 1, d, func, + dynamic_shared_memory_size, block_size_limit); } #if GOOGLE_CUDA @@ -361,6 +401,15 @@ inline const cudaStream_t& GetCudaStream(OpKernelContext* context) { ->GpuStreamMemberHack())); return *ptr; } +template <typename DeviceFunc> +Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim, + const Eigen::GpuDevice& d, + DeviceFunc func, + size_t dynamic_shared_memory_size, + int block_size_limit) { + return GetGpu2DLaunchConfig(xdim, ydim, d, func, dynamic_shared_memory_size, + block_size_limit); +} #endif // GOOGLE_CUDA namespace detail { diff --git a/tensorflow/core/util/tensor_format.cc b/tensorflow/core/util/tensor_format.cc index f331973f5ce..5dbd8ef318f 100644 --- a/tensorflow/core/util/tensor_format.cc +++ b/tensorflow/core/util/tensor_format.cc @@ -63,6 +63,8 @@ string ToString(FilterTensorFormat format) { return "HWIO"; case FORMAT_OIHW: return "OIHW"; + case FORMAT_OHWI: + return "OHWI"; case FORMAT_OIHW_VECT_I: return "OIHW_VECT_I"; default: diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h index 643e14e0b56..82af5c545f7 100644 --- a/tensorflow/core/util/tensor_format.h +++ b/tensorflow/core/util/tensor_format.h @@ -80,6 +80,9 @@ enum FilterTensorFormat { // FORMAT_OIHW often improves performance on GPUs. FORMAT_OIHW = 1, + // FORMAT_OHWI used by cuDNN for NHWC convolutions. + FORMAT_OHWI = 2, + // OIHW_VECT_I is the most performant tensor format for cudnn6's quantized // int8 convolution and fused convolution. It is analogous to the NCHW_VECT_C // data format. It is laid out in the same order as OIHW, except that the size @@ -88,7 +91,7 @@ enum FilterTensorFormat { // int32. Thus an OIHW format filter with dimensions [O, I, H, W] would have // dimensions [O, I/4, H, W, 4] in OIHW_VECT_I format. // A pre-condition of this format is that I must be a multiple of 4. 
- FORMAT_OIHW_VECT_I = 2, + FORMAT_OIHW_VECT_I = 3, }; // Parse tensor format from the given string. diff --git a/tensorflow/core/util/test_log.proto b/tensorflow/core/util/test_log.proto index 823c5803e9d..ddb0599388f 100644 --- a/tensorflow/core/util/test_log.proto +++ b/tensorflow/core/util/test_log.proto @@ -212,4 +212,8 @@ message TestResults { // * presubmit: results from oneshot requests. // * culprit: results from culprit finder rerun. string run_mode = 11; + + // TensorFlow version this benchmark runs against. + // This can be either set to full version or just the major version. + string tf_version = 12; }; diff --git a/tensorflow/examples/adding_an_op/BUILD b/tensorflow/examples/adding_an_op/BUILD index a4d6f204cd9..e3ee520e760 100644 --- a/tensorflow/examples/adding_an_op/BUILD +++ b/tensorflow/examples/adding_an_op/BUILD @@ -69,7 +69,10 @@ py_test( size = "small", srcs = ["zero_out_1_test.py"], srcs_version = "PY2AND3", - tags = ["notap"], + tags = [ + "no_pip", + "notap", + ], deps = [ ":zero_out_op_1", "//tensorflow:tensorflow_py", @@ -81,7 +84,10 @@ py_test( size = "small", srcs = ["zero_out_2_test.py"], srcs_version = "PY2AND3", - tags = ["notap"], + tags = [ + "no_pip", + "notap", + ], deps = [ ":zero_out_grad_2", ":zero_out_op_2", @@ -94,7 +100,10 @@ py_test( size = "small", srcs = ["zero_out_3_test.py"], srcs_version = "PY2AND3", - tags = ["notap"], + tags = [ + "no_pip", + "notap", + ], deps = [ ":zero_out_op_3", "//tensorflow:tensorflow_py", @@ -121,7 +130,10 @@ py_test( srcs = ["cuda_op_test.py"], exec_compatible_with = tf_exec_compatible_with({"tags": tf_cuda_tests_tags()}), srcs_version = "PY2AND3", - tags = tf_cuda_tests_tags() + ["notap"], + tags = tf_cuda_tests_tags() + [ + "notap", + "no_pip", + ], deps = [ ":cuda_op", "//tensorflow:tensorflow_py", diff --git a/tensorflow/examples/android/jni/object_tracking/jni_utils.h b/tensorflow/examples/android/jni/object_tracking/jni_utils.h index 06048ecfd36..5f622a2e65f 100644 --- a/tensorflow/examples/android/jni/object_tracking/jni_utils.h +++ b/tensorflow/examples/android/jni/object_tracking/jni_utils.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_JNI_UTILS_H_ #define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_JNI_UTILS_H_ +#include <jni.h> #include <stdint.h> #include "tensorflow/examples/android/jni/object_tracking/utils.h" diff --git a/tensorflow/examples/android/jni/object_tracking/sprite.h b/tensorflow/examples/android/jni/object_tracking/sprite.h index b54a68458f1..964f1c30bfa 100755 --- a/tensorflow/examples/android/jni/object_tracking/sprite.h +++ b/tensorflow/examples/android/jni/object_tracking/sprite.h @@ -16,16 +16,14 @@ limitations under the License. 
#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_ #define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_ +#ifdef __RENDER_OPENGL__ + #include <GLES/gl.h> #include <GLES/glext.h> #include "tensorflow/examples/android/jni/object_tracking/image-inl.h" #include "tensorflow/examples/android/jni/object_tracking/image.h" -#ifndef __RENDER_OPENGL__ -#error sprite.h should not included if OpenGL is not enabled by platform.h -#endif - namespace tf_tracking { // This class encapsulates the logic necessary to load an render image data @@ -199,4 +197,6 @@ class Sprite { } // namespace tf_tracking +#endif // __RENDER_OPENGL__ + #endif // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_ diff --git a/tensorflow/examples/label_image/BUILD b/tensorflow/examples/label_image/BUILD index c50fd93d039..cc73163f3b5 100644 --- a/tensorflow/examples/label_image/BUILD +++ b/tensorflow/examples/label_image/BUILD @@ -57,6 +57,7 @@ py_binary( name = "label_image_py", srcs = ["label_image.py"], main = "label_image.py", + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow:tensorflow_py", diff --git a/tensorflow/examples/learn/BUILD b/tensorflow/examples/learn/BUILD index a22d55e5af7..d98fe96f47a 100644 --- a/tensorflow/examples/learn/BUILD +++ b/tensorflow/examples/learn/BUILD @@ -12,6 +12,7 @@ exports_files(["LICENSE"]) py_binary( name = "iris_custom_decay_dnn", srcs = ["iris_custom_decay_dnn.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = ["//tensorflow:tensorflow_py"], ) @@ -19,6 +20,7 @@ py_binary( py_binary( name = "iris_custom_model", srcs = ["iris_custom_model.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = ["//tensorflow:tensorflow_py"], ) diff --git a/tensorflow/examples/saved_model/integration_tests/BUILD b/tensorflow/examples/saved_model/integration_tests/BUILD index 381250279d0..5ade3c2dbea 100644 --- a/tensorflow/examples/saved_model/integration_tests/BUILD +++ b/tensorflow/examples/saved_model/integration_tests/BUILD @@ -2,18 +2,16 @@ licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) -load("//tensorflow:tensorflow.bzl", "py_test") +load("//tensorflow:tensorflow.bzl", "cuda_py_test") -# This target bundles many scripts into a single py_binary so they can be -# executed by saved_model_test without exploding the data dependencies. -py_binary( - name = "run_script", +py_library( + name = "integration_scripts", srcs = [ "export_mnist_cnn.py", "export_rnn_cell.py", "export_simple_text_embedding.py", "export_text_rnn_model.py", - "run_script.py", + "integration_scripts.py", "use_mnist_cnn.py", "use_model_in_sequential_keras.py", "use_rnn_cell.py", @@ -23,16 +21,6 @@ py_binary( visibility = ["//tensorflow:internal"], deps = [ ":mnist_util", - ":util", - "//tensorflow:tensorflow_py", - ], -) - -py_library( - name = "util", - srcs = ["util.py"], - visibility = ["//tensorflow:internal"], - deps = [ "//tensorflow:tensorflow_py", ], ) @@ -46,25 +34,32 @@ py_library( ], ) -py_test( +cuda_py_test( name = "saved_model_test", srcs = [ "saved_model_test.py", ], - data = [ - ":run_script", + additional_deps = [ + ":integration_scripts", + "//tensorflow:tensorflow_py", ], shard_count = 4, - srcs_version = "PY2AND3", tags = [ - # NOTE: Split SavedModelTest due to Forge input size limit. 
- "no_cuda_on_cpu_tap", # forge input size exceeded + "no_pip", # b/131697937 and b/132196869 "noasan", # forge input size exceeded "nomsan", # forge input size exceeded "notsan", # forge input size exceeded - "no_pip", # b/131697937 - ], - deps = [ - "//tensorflow:tensorflow_py", ], ) + +# b/132234211: Target added to support internal test target that runs the test +# in an environment that has the extra dependencies required to test integration +# with non core tensorflow packages. +py_library( + name = "saved_model_test_lib", + srcs = [ + "saved_model_test.py", + ], + visibility = ["//tensorflow:internal"], + deps = [":integration_scripts"], +) diff --git a/tensorflow/examples/saved_model/integration_tests/integration_scripts.py b/tensorflow/examples/saved_model/integration_tests/integration_scripts.py new file mode 100644 index 00000000000..0db91facd65 --- /dev/null +++ b/tensorflow/examples/saved_model/integration_tests/integration_scripts.py @@ -0,0 +1,65 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utility to write SavedModel integration tests. + +SavedModel testing requires isolation between the process that creates and +consumes it. This file helps doing that by relaunching the same binary that +calls `assertCommandSucceeded` with an environment flag indicating what source +file to execute. That binary must start by calling `MaybeRunScriptInstead`. + +This allows to wire this into existing building systems without having to depend +on data dependencies. And as so allow to keep a fixed binary size and allows +interop with GPU tests. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import importlib +import os +import subprocess +import sys + +from absl import app +import tensorflow.compat.v2 as tf + +from tensorflow.python.platform import tf_logging as logging + + +class TestCase(tf.test.TestCase): + """Base class to write SavedModel integration tests.""" + + def assertCommandSucceeded(self, script_name, **flags): + """Runs an integration test script with given flags.""" + run_script = sys.argv[0] + if run_script.endswith(".py"): + command_parts = [sys.executable, run_script] + else: + command_parts = [run_script] + for flag_key, flag_value in flags.items(): + command_parts.append("--%s=%s" % (flag_key, flag_value)) + env = dict(TF2_BEHAVIOR="enabled", SCRIPT_NAME=script_name) + logging.info("Running: %s with environment flags %s" % (command_parts, env)) + subprocess.check_call(command_parts, env=dict(os.environ, **env)) + + +def MaybeRunScriptInstead(): + if "SCRIPT_NAME" in os.environ: + # Append current path to import path and execute `SCRIPT_NAME` main. 
+ sys.path.extend([os.path.dirname(__file__)]) + module_name = os.environ["SCRIPT_NAME"] + retval = app.run(importlib.import_module(module_name).main) + sys.exit(retval) diff --git a/tensorflow/examples/saved_model/integration_tests/saved_model_test.py b/tensorflow/examples/saved_model/integration_tests/saved_model_test.py index b8e2019514d..7cc8fde6167 100644 --- a/tensorflow/examples/saved_model/integration_tests/saved_model_test.py +++ b/tensorflow/examples/saved_model/integration_tests/saved_model_test.py @@ -18,26 +18,27 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os -import subprocess - import tensorflow.compat.v2 as tf -from tensorflow.python.platform import resource_loader -from tensorflow.python.platform import tf_logging as logging +from tensorflow.examples.saved_model.integration_tests import integration_scripts -class SavedModelTest(tf.test.TestCase): +class SavedModelTest(integration_scripts.TestCase): - def assertCommandSucceeded(self, script_name, **flags): - """Runs a test script via run_script.""" - run_script = resource_loader.get_path_to_datafile("run_script") - command_parts = [run_script] - for flag_key, flag_value in flags.items(): - command_parts.append("--%s=%s" % (flag_key, flag_value)) - env = dict(TF2_BEHAVIOR="enabled", SCRIPT_NAME=script_name) - logging.info("Running: %s with environment flags %s" % (command_parts, env)) - subprocess.check_call(command_parts, env=dict(os.environ, **env)) + def __init__(self, method_name="runTest", has_extra_deps=False): + super(SavedModelTest, self).__init__(method_name) + self.has_extra_deps = has_extra_deps + + def skipIfMissingExtraDeps(self): + """Skip test if it requires extra dependencies. + + b/132234211: The extra dependencies are not available in all environments + that run the tests, e.g. "tensorflow_hub" is not available from tests + within "tensorflow" alone. Those tests are instead run by another + internal test target. 
+ """ + if not self.has_extra_deps: + self.skipTest("Missing extra dependencies") def test_text_rnn(self): export_dir = self.get_temp_dir() @@ -50,6 +51,7 @@ class SavedModelTest(tf.test.TestCase): self.assertCommandSucceeded("use_rnn_cell", model_dir=export_dir) def test_text_embedding_in_sequential_keras(self): + self.skipIfMissingExtraDeps() export_dir = self.get_temp_dir() self.assertCommandSucceeded( "export_simple_text_embedding", export_dir=export_dir) @@ -57,6 +59,9 @@ class SavedModelTest(tf.test.TestCase): "use_model_in_sequential_keras", model_dir=export_dir) def test_text_embedding_in_dataset(self): + if tf.test.is_gpu_available(): + self.skipTest("b/132156097 - fails if there is a gpu available") + export_dir = self.get_temp_dir() self.assertCommandSucceeded( "export_simple_text_embedding", export_dir=export_dir) @@ -64,6 +69,7 @@ class SavedModelTest(tf.test.TestCase): "use_text_embedding_in_dataset", model_dir=export_dir) def test_mnist_cnn(self): + self.skipIfMissingExtraDeps() export_dir = self.get_temp_dir() self.assertCommandSucceeded( "export_mnist_cnn", export_dir=export_dir, fast_test_mode="true") @@ -71,6 +77,7 @@ class SavedModelTest(tf.test.TestCase): "use_mnist_cnn", export_dir=export_dir, fast_test_mode="true") def test_mnist_cnn_with_mirrored_strategy(self): + self.skipIfMissingExtraDeps() self.skipTest( "b/129134185 - saved model and distribution strategy integration") export_dir = self.get_temp_dir() @@ -85,5 +92,7 @@ class SavedModelTest(tf.test.TestCase): use_mirrored_strategy=True, ) + if __name__ == "__main__": + integration_scripts.MaybeRunScriptInstead() tf.test.main() diff --git a/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py b/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py index c08b5483f14..957091f0e86 100644 --- a/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py +++ b/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py @@ -29,9 +29,9 @@ from __future__ import print_function from absl import app from absl import flags import tensorflow.compat.v2 as tf +import tensorflow_hub as hub from tensorflow.examples.saved_model.integration_tests import mnist_util -from tensorflow.examples.saved_model.integration_tests import util FLAGS = flags.FLAGS @@ -80,9 +80,7 @@ def make_feature_extractor(saved_model_path, trainable, if FLAGS.dropout_rate is not None: arguments['dropout_rate'] = FLAGS.dropout_rate - # CustomLayer mimics hub.KerasLayer because the tests are not able to depend - # on Hub at the moment. 
- return util.CustomLayer(obj, trainable=trainable, arguments=arguments) + return hub.KerasLayer(obj, trainable=trainable, arguments=arguments) def make_classifier(feature_extractor, l2_strength=0.01, dropout_rate=0.5): diff --git a/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py b/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py index c7a1a909548..2446ff91fb0 100644 --- a/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py +++ b/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py @@ -23,7 +23,7 @@ from absl import flags import numpy as np import tensorflow.compat.v2 as tf -from tensorflow.examples.saved_model.integration_tests import util +import tensorflow_hub as hub FLAGS = flags.FLAGS @@ -42,7 +42,8 @@ def train(fine_tuning): l = tf.keras.layers model = tf.keras.Sequential() model.add(l.Reshape((), batch_input_shape=[None, 1], dtype=tf.string)) - model.add(util.CustomLayer(module, output_shape=[10], trainable=fine_tuning)) + # TODO(b/124219898): output_shape should be optional. + model.add(hub.KerasLayer(module, output_shape=[10], trainable=fine_tuning)) model.add(l.Dense(100, activation="relu")) model.add(l.Dense(50, activation="relu")) model.add(l.Dense(1, activation="sigmoid")) diff --git a/tensorflow/examples/saved_model/integration_tests/util.py b/tensorflow/examples/saved_model/integration_tests/util.py deleted file mode 100644 index 1b709fdf98c..00000000000 --- a/tensorflow/examples/saved_model/integration_tests/util.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Utilities for integration tests.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import functools - -import tensorflow.compat.v2 as tf - -from tensorflow.python.framework import smart_cond -from tensorflow.python.util import tf_inspect - - -# TODO(vbardiovsky): We should just reuse Keras's Lambda layer, when that -# enables to get trainable variables. -class CustomLayer(tf.keras.layers.Layer): - """Wraps callable object as a `Layer` object. - - Args: - func: The callable object to wrap. Layer inputs are passed as the first - positional argument. If `func` accepts a `training` argument, a Python - boolean is passed for it. - If present, the following attributes of `func` have a special meaning: - * variables: a list of all tf.Variable objects that `func` depends on. - * trainable_variables: those elements of `variables` that are reported - as trainable variables of this Keras Layer. - * regularization_losses: a list of callables to be added as losses - of this Keras layer. Each one must accept zero arguments and return - a scalare tensor. 
- trainable: Boolean controlling whether the trainable variables of `func` - are reported as trainable variables of this layer. - arguments: optionally, a dict with additional keyword arguments passed - to `func`. - **kwargs: 'output_shape': A tuple with the (possibly partial) output - shape of the callable *without* leading batch size. Other arguments - are pass into the Layer constructor. - """ - - def __init__(self, func, trainable=False, arguments=None, **kwargs): - # Set self._{non,}_trainable_weights before calling Layer.__init__. - if hasattr(func, 'trainable_variables'): - self._trainable_weights = [v for v in func.trainable_variables] - trainable_variables_set = set(func.trainable_variables) - else: - self._trainable_weights = [] - trainable_variables_set = set() - if hasattr(func, 'variables'): - self._non_trainable_weights = [v for v in func.variables - if v not in trainable_variables_set] - else: - self._non_trainable_weights = [] # TODO(arnoegw): Infer from `func`. - - # TODO(b/124219898): We should be able to get the embedding dimension from - # the restored model. - if 'output_shape' in kwargs: - self._output_shape = tuple(kwargs.pop('output_shape')) - - super(CustomLayer, self).__init__(trainable=trainable, **kwargs) - # Prepare to call `func`. - self._func = func - self._func_fullargspec = tf_inspect.getfullargspec(func.__call__) - self._func_wants_training = ( - 'training' in self._func_fullargspec.args or - 'training' in self._func_fullargspec.kwonlyargs) - self._arguments = arguments or {} - # Forward the callable's regularization losses (if any). - if hasattr(func, 'regularization_losses'): - for l in func.regularization_losses: - if not callable(l): - raise ValueError( - 'CustomLayer(func) expects func.regularization_losses to be an ' - 'iterable of callables, each returning a scalar loss term.') - self.add_loss(l) # Supports callables. - - def call(self, x, training=None): - # We basically want to call this... - f = functools.partial(self._func, x, **self._arguments) - # ...but we may also have to pass a Python boolean for `training`. - if not self._func_wants_training: - result = f() - else: - if training is None: - training = tf.keras.backend.learning_phase() # Could be a tensor. - result = smart_cond.smart_cond(training, - lambda: f(training=True), - lambda: f(training=False)) - # TODO(b/124219898): Polymorphic function should return shaped tensor. 
- if hasattr(self, '_output_shape'): - result.set_shape((x.shape[0],) + self._output_shape) - return result diff --git a/tensorflow/examples/tutorials/layers/BUILD b/tensorflow/examples/tutorials/layers/BUILD index aad78b18409..e4383d155b0 100644 --- a/tensorflow/examples/tutorials/layers/BUILD +++ b/tensorflow/examples/tutorials/layers/BUILD @@ -13,6 +13,7 @@ py_binary( srcs = [ "cnn_mnist.py", ], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow:tensorflow_py", diff --git a/tensorflow/examples/tutorials/mnist/BUILD b/tensorflow/examples/tutorials/mnist/BUILD index 5f12374bdbd..6839c486144 100644 --- a/tensorflow/examples/tutorials/mnist/BUILD +++ b/tensorflow/examples/tutorials/mnist/BUILD @@ -50,6 +50,7 @@ py_binary( srcs = [ "fully_connected_feed.py", ], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["optonly"], deps = [ @@ -64,6 +65,7 @@ py_binary( srcs = [ "mnist_with_summaries.py", ], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":input_data", @@ -82,6 +84,7 @@ py_binary( srcs = [ "mnist_softmax_xla.py", ], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":input_data", @@ -100,6 +103,7 @@ py_test( "--max_steps=10", ], main = "fully_connected_feed.py", + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":input_data", @@ -120,6 +124,7 @@ py_test( "--learning_rate=0.00", ], main = "mnist_with_summaries.py", + python_version = "PY2", srcs_version = "PY2AND3", tags = ["notsan"], # http://b/29184009 deps = [ diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 4ececd4c31f..8412a9f6213 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -38,65 +38,295 @@ func makeOutputList(op *tf.Operation, start int, output string) ([]tf.Output, in return list, start + size, nil } -// Applies sparse addition to `input` using individual values or slices +// Generates fingerprint values. // -// from `updates` according to indices `indices`. The updates are non-aliasing: -// `input` is only modified in-place if no other operations will use it. -// Otherwise, a copy of `input` is made. This operation has a gradient with -// respect to both `input` and `updates`. +// Generates fingerprint values of `data`. // -// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. +// Fingerprint op considers the first dimension of `data` as the batch dimension, +// and `output[i]` contains the fingerprint value generated from contents in +// `data[i, ...]` for all `i`. // -// `indices` must be integer tensor, containing indices into `input`. -// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`. +// Fingerprint op writes fingerprint values as byte arrays. For example, the +// default method `farmhash64` generates a 64-bit fingerprint value at a time. +// This 8-byte value is written out as an `uint8` array of size 8, in little-endian +// order. // -// The innermost dimension of `indices` (with length `K`) corresponds to -// indices into elements (if `K = P`) or `(P-K)`-dimensional slices -// (if `K < P`) along the `K`th dimension of `input`. +// For example, suppose that `data` has data type `DT_INT32` and shape (2, 3, 4), +// and that the fingerprint method is `farmhash64`. In this case, the output shape +// is (2, 8), where 2 is the batch dimension size of `data`, and 8 is the size of +// each fingerprint value in bytes. 
`output[0, :]` is generated from 12 integers in +// `data[0, :, :]` and similarly `output[1, :]` is generated from other 12 integers +// in `data[1, :, :]`. // -// `updates` is `Tensor` of rank `Q-1+P-K` with shape: +// Note that this op fingerprints the raw underlying buffer, and it does not +// fingerprint Tensor's metadata such as data type and/or shape. For example, the +// fingerprint values are invariant under reshapes and bitcasts as long as the +// batch dimension remain the same: // -// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$ +// ``` +// Fingerprint(data) == Fingerprint(Reshape(data, ...)) +// Fingerprint(data) == Fingerprint(Bitcast(data, ...)) +// ``` // -// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8 -// elements. In Python, that addition would look like this: -// -// input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8]) -// indices = tf.constant([[4], [3], [1], [7]]) -// updates = tf.constant([9, 10, 11, 12]) -// output = tf.scatter_nd_non_aliasing_add(input, indices, updates) -// with tf.Session() as sess: -// print(sess.run(output)) -// -// The resulting value `output` would look like this: -// -// [1, 13, 3, 14, 14, 6, 7, 20] -// -// See `tf.scatter_nd` for more details about how to make updates to slices. +// For string data, one should expect `Fingerprint(data) != +// Fingerprint(ReduceJoin(data))` in general. // // Arguments: -// input: A Tensor. -// indices: A Tensor. Must be one of the following types: `int32`, `int64`. -// A tensor of indices into `input`. -// updates: A Tensor. Must have the same type as ref. A tensor of updated values -// to add to `input`. +// data: Must have rank 1 or higher. +// method: Fingerprint method used by this op. Currently available method is +// `farmhash::fingerprint64`. // -// Returns A `Tensor` with the same shape as `input`, containing values of `input` -// updated with `updates`. -func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) { +// Returns A two-dimensional `Tensor` of type `tf.uint8`. The first dimension equals to +// `data`'s first dimension, and the second dimension size depends on the +// fingerprint algorithm. +func Fingerprint(scope *Scope, data tf.Output, method tf.Output) (fingerprint tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "ScatterNdNonAliasingAdd", + Type: "Fingerprint", Input: []tf.Input{ - input, indices, updates, + data, method, }, } op := scope.AddOperation(opspec) return op.Output(0) } +// FakeQuantWithMinMaxVarsPerChannelGradientAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannelGradient. +type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr) + +// FakeQuantWithMinMaxVarsPerChannelGradientNumBits sets the optional num_bits attribute to value. +// +// value: The bitwidth of the quantization; between 2 and 16, inclusive. +// If not specified, defaults to 8 +func FakeQuantWithMinMaxVarsPerChannelGradientNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelGradientAttr { + return func(m optionalAttr) { + m["num_bits"] = value + } +} + +// FakeQuantWithMinMaxVarsPerChannelGradientNarrowRange sets the optional narrow_range attribute to value. +// +// value: Whether to quantize into 2^num_bits - 1 distinct values. 
+// If not specified, defaults to false +func FakeQuantWithMinMaxVarsPerChannelGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsPerChannelGradientAttr { + return func(m optionalAttr) { + m["narrow_range"] = value + } +} + +// Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation. +// +// Arguments: +// gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation, +// shape one of: `[d]`, `[b, d]`, `[b, h, w, d]`. +// inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation, shape +// same as `gradients`. +// min, max: Quantization interval, floats of shape `[d]`. +// +// +// +// Returns Backpropagated gradients w.r.t. inputs, shape same as +// `inputs`: +// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter, shape `[d]`: +// `sum_per_d(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter, shape `[d]`: +// `sum_per_d(gradients * (inputs > max))`. +func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "FakeQuantWithMinMaxVarsPerChannelGradient", + Input: []tf.Input{ + gradients, inputs, min, max, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient. +type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr) + +// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value. +// +// value: The bitwidth of the quantization; between 2 and 8, inclusive. +// If not specified, defaults to 8 +func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr { + return func(m optionalAttr) { + m["num_bits"] = value + } +} + +// FakeQuantWithMinMaxVarsGradientNarrowRange sets the optional narrow_range attribute to value. +// +// value: Whether to quantize into 2^num_bits - 1 distinct values. +// If not specified, defaults to false +func FakeQuantWithMinMaxVarsGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsGradientAttr { + return func(m optionalAttr) { + m["narrow_range"] = value + } +} + +// Compute gradients for a FakeQuantWithMinMaxVars operation. +// +// Arguments: +// gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation. +// inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation. +// min, max: Quantization interval, scalar floats. +// +// +// +// Returns Backpropagated gradients w.r.t. inputs: +// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter: +// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter: +// `sum(gradients * (inputs > max))`. 
+func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "FakeQuantWithMinMaxVarsGradient", + Input: []tf.Input{ + gradients, inputs, min, max, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars. +type FakeQuantWithMinMaxVarsAttr func(optionalAttr) + +// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value. +// If not specified, defaults to 8 +func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr { + return func(m optionalAttr) { + m["num_bits"] = value + } +} + +// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value. +// If not specified, defaults to false +func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr { + return func(m optionalAttr) { + m["narrow_range"] = value + } +} + +// Fake-quantize the 'inputs' tensor of type float via global float scalars `min` +// +// and `max` to 'outputs' tensor of same shape as `inputs`. +// +// `[min; max]` define the clamping range for the `inputs` data. +// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]` +// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and +// then de-quantized and output as floats in `[min; max]` interval. +// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive. +// +// Before quantization, `min` and `max` values are adjusted with the following +// logic. +// It is suggested to have `min <= 0 <= max`. If `0` is not in the range of values, +// the behavior can be unexpected: +// If `0 < min < max`: `min_adj = 0` and `max_adj = max - min`. +// If `min < max < 0`: `min_adj = min - max` and `max_adj = 0`. +// If `min <= 0 <= max`: `scale = (max - min) / (2^num_bits - 1) `, +// `min_adj = scale * round(min / scale)` and `max_adj = max + min_adj - min`. +// +// This operation has a gradient and thus allows for training `min` and `max` +// values. +func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "FakeQuantWithMinMaxVars", + Input: []tf.Input{ + inputs, min, max, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient. +type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr) + +// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value. +// If not specified, defaults to -6 +func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr { + return func(m optionalAttr) { + m["min"] = value + } +} + +// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value. 
+// If not specified, defaults to 6 +func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr { + return func(m optionalAttr) { + m["max"] = value + } +} + +// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value. +// If not specified, defaults to 8 +func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr { + return func(m optionalAttr) { + m["num_bits"] = value + } +} + +// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value. +// If not specified, defaults to false +func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr { + return func(m optionalAttr) { + m["narrow_range"] = value + } +} + +// Compute gradients for a FakeQuantWithMinMaxArgs operation. +// +// Arguments: +// gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation. +// inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation. +// +// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation: +// `gradients * (inputs >= min && inputs <= max)`. +func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "FakeQuantWithMinMaxArgsGradient", + Input: []tf.Input{ + gradients, inputs, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Subtracts sparse `updates` from an existing tensor according to `indices`. // // This operation creates a new tensor by subtracting sparse `updates` from the @@ -184,93 +414,6 @@ func TensorScatterSub(scope *Scope, tensor tf.Output, indices tf.Output, updates return op.Output(0) } -// Adds sparse `updates` to an existing tensor according to `indices`. -// -// This operation creates a new tensor by adding sparse `updates` to the passed -// in `tensor`. -// This operation is very similar to `tf.scatter_nd_add`, except that the updates -// are added onto an existing tensor (as opposed to a variable). If the memory -// for the existing tensor cannot be re-used, a copy is made and updated. -// -// `indices` is an integer tensor containing indices into a new tensor of shape -// `shape`. The last dimension of `indices` can be at most the rank of `shape`: -// -// indices.shape[-1] <= shape.rank -// -// The last dimension of `indices` corresponds to indices into elements -// (if `indices.shape[-1] = shape.rank`) or slices -// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of -// `shape`. `updates` is a tensor with shape -// -// indices.shape[:-1] + shape[indices.shape[-1]:] -// -// The simplest form of tensor_scatter_add is to add individual elements to a -// tensor by index. For example, say we want to add 4 elements in a rank-1 -// tensor with 8 elements. 
-// -// In Python, this scatter add operation would look like this: -// -// ```python -// indices = tf.constant([[4], [3], [1], [7]]) -// updates = tf.constant([9, 10, 11, 12]) -// tensor = tf.ones([8], dtype=tf.int32) -// updated = tf.tensor_scatter_add(tensor, indices, updates) -// with tf.Session() as sess: -// print(sess.run(scatter)) -// ``` -// -// The resulting tensor would look like this: -// -// [1, 12, 1, 11, 10, 1, 1, 13] -// -// We can also, insert entire slices of a higher rank tensor all at once. For -// example, if we wanted to insert two slices in the first dimension of a -// rank-3 tensor with two matrices of new values. -// -// In Python, this scatter add operation would look like this: -// -// ```python -// indices = tf.constant([[0], [2]]) -// updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6], -// [7, 7, 7, 7], [8, 8, 8, 8]], -// [[5, 5, 5, 5], [6, 6, 6, 6], -// [7, 7, 7, 7], [8, 8, 8, 8]]]) -// tensor = tf.ones([4, 4, 4]) -// updated = tf.tensor_scatter_add(tensor, indices, updates) -// with tf.Session() as sess: -// print(sess.run(scatter)) -// ``` -// -// The resulting tensor would look like this: -// -// [[[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]], -// [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]], -// [[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]], -// [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]] -// -// Note that on CPU, if an out of bound index is found, an error is returned. -// On GPU, if an out of bound index is found, the index is ignored. -// -// Arguments: -// tensor: Tensor to copy/update. -// indices: Index tensor. -// updates: Updates to scatter into output. -// -// Returns A new tensor copied from tensor and updates added according to the indices. -func TensorScatterAdd(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorScatterAdd", - Input: []tf.Input{ - tensor, indices, updates, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Scatter `updates` into an existing tensor according to `indices`. // // This operation creates a new tensor by applying sparse `updates` to the passed @@ -370,64 +513,6 @@ func TensorScatterUpdate(scope *Scope, tensor tf.Output, indices tf.Output, upda return op.Output(0) } -// LowerBoundAttr is an optional argument to LowerBound. -type LowerBoundAttr func(optionalAttr) - -// LowerBoundOutType sets the optional out_type attribute to value. -// If not specified, defaults to DT_INT32 -func LowerBoundOutType(value tf.DataType) LowerBoundAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// Applies lower_bound(sorted_search_values, values) along each row. -// -// Each set of rows with the same index in (sorted_inputs, values) is treated -// independently. The resulting row is the equivalent of calling -// `np.searchsorted(sorted_inputs, values, side='left')`. -// -// The result is not a global index to the entire -// `Tensor`, but rather just the index in the last dimension. -// -// A 2-D example: -// sorted_sequence = [[0, 3, 9, 9, 10], -// [1, 2, 3, 4, 5]] -// values = [[2, 4, 9], -// [0, 2, 6]] -// -// result = LowerBound(sorted_sequence, values) -// -// result == [[1, 2, 2], -// [0, 1, 5]] -// -// Arguments: -// sorted_inputs: 2-D Tensor where each row is ordered. -// values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains -// the values that will be searched for in `sorted_search_values`. 
-// -// Returns A `Tensor` with the same shape as `values`. It contains the first scalar index -// into the last dimension where values can be inserted without changing the -// ordered property. -func LowerBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...LowerBoundAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LowerBound", - Input: []tf.Input{ - sorted_inputs, values, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // UpperBoundAttr is an optional argument to UpperBound. type UpperBoundAttr func(optionalAttr) @@ -714,6 +799,48 @@ func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf return op.Output(0), op.Output(1), op.Output(2) } +// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3. +type QuantizeAndDequantizeV3Attr func(optionalAttr) + +// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value. +// If not specified, defaults to true +func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr { + return func(m optionalAttr) { + m["signed_input"] = value + } +} + +// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value. +// If not specified, defaults to true +func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr { + return func(m optionalAttr) { + m["range_given"] = value + } +} + +// Quantizes then dequantizes a tensor. +// +// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a +// tensor, so its value can change during training. +func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QuantizeAndDequantizeV3", + Input: []tf.Input{ + input, input_min, input_max, num_bits, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // QuantizeAndDequantizeV2Attr is an optional argument to QuantizeAndDequantizeV2. type QuantizeAndDequantizeV2Attr func(optionalAttr) @@ -847,71 +974,6 @@ func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output, return op.Output(0) } -// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize. -type QuantizeAndDequantizeAttr func(optionalAttr) - -// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value. -// If not specified, defaults to true -func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr { - return func(m optionalAttr) { - m["signed_input"] = value - } -} - -// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value. -// If not specified, defaults to 8 -func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr { - return func(m optionalAttr) { - m["num_bits"] = value - } -} - -// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value. -// If not specified, defaults to false -func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr { - return func(m optionalAttr) { - m["range_given"] = value - } -} - -// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value. 
-// If not specified, defaults to 0 -func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr { - return func(m optionalAttr) { - m["input_min"] = value - } -} - -// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value. -// If not specified, defaults to 0 -func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr { - return func(m optionalAttr) { - m["input_max"] = value - } -} - -// Use QuantizeAndDequantizeV2 instead. -// -// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2 -func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QuantizeAndDequantize", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // OneHotAttr is an optional argument to OneHot. type OneHotAttr func(optionalAttr) @@ -1145,131 +1207,6 @@ func ExtractVolumePatches(scope *Scope, input tf.Output, ksizes []int64, strides return op.Output(0) } -// DepthToSpaceAttr is an optional argument to DepthToSpace. -type DepthToSpaceAttr func(optionalAttr) - -// DepthToSpaceDataFormat sets the optional data_format attribute to value. -// If not specified, defaults to "NHWC" -func DepthToSpaceDataFormat(value string) DepthToSpaceAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// DepthToSpace for tensors of type T. -// -// Rearranges data from depth into blocks of spatial data. -// This is the reverse transformation of SpaceToDepth. More specifically, -// this op outputs a copy of the input tensor where values from the `depth` -// dimension are moved in spatial blocks to the `height` and `width` dimensions. -// The attr `block_size` indicates the input block size and how the data is moved. -// -// * Chunks of data of size `block_size * block_size` from depth are rearranged -// into non-overlapping blocks of size `block_size x block_size` -// * The width the output tensor is `input_depth * block_size`, whereas the -// height is `input_height * block_size`. -// * The Y, X coordinates within each block of the output image are determined -// by the high order component of the input channel index. -// * The depth of the input tensor must be divisible by -// `block_size * block_size`. -// -// The `data_format` attr specifies the layout of the input and output tensors -// with the following options: -// "NHWC": `[ batch, height, width, channels ]` -// "NCHW": `[ batch, channels, height, width ]` -// "NCHW_VECT_C": -// `qint8 [ batch, channels / 4, height, width, 4 ]` -// -// It is useful to consider the operation as transforming a 6-D Tensor. -// e.g. for data_format = NHWC, -// Each element in the input tensor can be specified via 6 coordinates, -// ordered by decreasing memory layout significance as: -// n,iY,iX,bY,bX,oC (where n=batch index, iX, iY means X or Y coordinates -// within the input image, bX, bY means coordinates -// within the output block, oC means output channels). -// The output would be the input transposed to the following layout: -// n,iY,bY,iX,bX,oC -// -// This operation is useful for resizing the activations between convolutions -// (but keeping all data), e.g. instead of pooling. It is also useful for training -// purely convolutional models. 
-// -// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and -// block_size = 2: -// -// ``` -// x = [[[[1, 2, 3, 4]]]] -// -// ``` -// -// This operation will output a tensor of shape `[1, 2, 2, 1]`: -// -// ``` -// [[[[1], [2]], -// [[3], [4]]]] -// ``` -// -// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`, -// the corresponding output will have 2x2 elements and will have a depth of -// 1 channel (1 = `4 / (block_size * block_size)`). -// The output element shape is `[2, 2, 1]`. -// -// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g. -// -// ``` -// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]] -// ``` -// -// This operation, for block size of 2, will return the following tensor of shape -// `[1, 2, 2, 3]` -// -// ``` -// [[[[1, 2, 3], [4, 5, 6]], -// [[7, 8, 9], [10, 11, 12]]]] -// -// ``` -// -// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2: -// -// ``` -// x = [[[[1, 2, 3, 4], -// [5, 6, 7, 8]], -// [[9, 10, 11, 12], -// [13, 14, 15, 16]]]] -// ``` -// -// the operator will return the following tensor of shape `[1 4 4 1]`: -// -// ``` -// x = [[[ [1], [2], [5], [6]], -// [ [3], [4], [7], [8]], -// [ [9], [10], [13], [14]], -// [ [11], [12], [15], [16]]]] -// -// ``` -// -// Arguments: -// -// block_size: The size of the spatial block, same as in Space2Depth. -func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"block_size": block_size} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "DepthToSpace", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // BatchToSpace for 4-D tensors of type T. // // This is a legacy version of the more general BatchToSpaceND. @@ -1511,6 +1448,143 @@ func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops return op.Output(0) } +// SpaceToBatch for N-D tensors of type T. +// +// This operation divides "spatial" dimensions `[1, ..., M]` of the input into a +// grid of blocks of shape `block_shape`, and interleaves these blocks with the +// "batch" dimension (0) such that in the output, the spatial dimensions +// `[1, ..., M]` correspond to the position within the grid, and the batch +// dimension combines both the position within a spatial block and the original +// batch position. Prior to division into blocks, the spatial dimensions of the +// input are optionally zero padded according to `paddings`. See below for a +// precise description. +// +// Arguments: +// input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`, +// where spatial_shape has `M` dimensions. +// block_shape: 1-D with shape `[M]`, all values must be >= 1. +// paddings: 2-D with shape `[M, 2]`, all values must be >= 0. +// `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension +// `i + 1`, which corresponds to spatial dimension `i`. It is required that +// `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`. +// +// This operation is equivalent to the following steps: +// +// 1. Zero-pad the start and end of dimensions `[1, ..., M]` of the +// input according to `paddings` to produce `padded` of shape `padded_shape`. +// +// 2. 
Reshape `padded` to `reshaped_padded` of shape: +// +// [batch] + +// [padded_shape[1] / block_shape[0], +// block_shape[0], +// ..., +// padded_shape[M] / block_shape[M-1], +// block_shape[M-1]] + +// remaining_shape +// +// 3. Permute dimensions of `reshaped_padded` to produce +// `permuted_reshaped_padded` of shape: +// +// block_shape + +// [batch] + +// [padded_shape[1] / block_shape[0], +// ..., +// padded_shape[M] / block_shape[M-1]] + +// remaining_shape +// +// 4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch +// dimension, producing an output tensor of shape: +// +// [batch * prod(block_shape)] + +// [padded_shape[1] / block_shape[0], +// ..., +// padded_shape[M] / block_shape[M-1]] + +// remaining_shape +// +// Some examples: +// +// (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and +// `paddings = [[0, 0], [0, 0]]`: +// +// ``` +// x = [[[[1], [2]], [[3], [4]]]] +// ``` +// +// The output tensor has shape `[4, 1, 1, 1]` and value: +// +// ``` +// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]] +// ``` +// +// (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and +// `paddings = [[0, 0], [0, 0]]`: +// +// ``` +// x = [[[[1, 2, 3], [4, 5, 6]], +// [[7, 8, 9], [10, 11, 12]]]] +// ``` +// +// The output tensor has shape `[4, 1, 1, 3]` and value: +// +// ``` +// [[[[1, 2, 3]]], [[[4, 5, 6]]], [[[7, 8, 9]]], [[[10, 11, 12]]]] +// ``` +// +// (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and +// `paddings = [[0, 0], [0, 0]]`: +// +// ``` +// x = [[[[1], [2], [3], [4]], +// [[5], [6], [7], [8]], +// [[9], [10], [11], [12]], +// [[13], [14], [15], [16]]]] +// ``` +// +// The output tensor has shape `[4, 2, 2, 1]` and value: +// +// ``` +// x = [[[[1], [3]], [[9], [11]]], +// [[[2], [4]], [[10], [12]]], +// [[[5], [7]], [[13], [15]]], +// [[[6], [8]], [[14], [16]]]] +// ``` +// +// (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and +// paddings = `[[0, 0], [2, 0]]`: +// +// ``` +// x = [[[[1], [2], [3], [4]], +// [[5], [6], [7], [8]]], +// [[[9], [10], [11], [12]], +// [[13], [14], [15], [16]]]] +// ``` +// +// The output tensor has shape `[8, 1, 3, 1]` and value: +// +// ``` +// x = [[[[0], [1], [3]]], [[[0], [9], [11]]], +// [[[0], [2], [4]]], [[[0], [10], [12]]], +// [[[0], [5], [7]]], [[[0], [13], [15]]], +// [[[0], [6], [8]]], [[[0], [14], [16]]]] +// ``` +// +// Among others, this operation is useful for reducing atrous convolution into +// regular convolution. +func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddings tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SpaceToBatchND", + Input: []tf.Input{ + input, block_shape, paddings, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // ListDiffAttr is an optional argument to ListDiff. type ListDiffAttr func(optionalAttr) @@ -1570,146 +1644,6 @@ func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) return op.Output(0), op.Output(1) } -// SqueezeAttr is an optional argument to Squeeze. -type SqueezeAttr func(optionalAttr) - -// SqueezeAxis sets the optional axis attribute to value. -// -// value: If specified, only squeezes the dimensions listed. The dimension -// index starts at 0. It is an error to squeeze a dimension that is not 1. Must -// be in the range `[-rank(input), rank(input))`. 
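// Usage sketch for SpaceToBatchND above (not part of the generated wrappers),
// reproducing example (1) from the comment: a [1, 2, 2, 1] input with
// block_shape = [2, 2] and zero paddings becomes a [4, 1, 1, 1] output.
// Imports and error handling follow the usual tensorflow/go pattern.
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	x := op.Const(s.SubScope("x"), [][][][]float32{{{{1}, {2}}, {{3}, {4}}}})
	blockShape := op.Const(s.SubScope("block_shape"), []int32{2, 2})
	paddings := op.Const(s.SubScope("paddings"), [][]int32{{0, 0}, {0, 0}})
	y := op.SpaceToBatchND(s, x, blockShape, paddings)

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	out, err := sess.Run(nil, []tf.Output{y}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Shape()) // [4 1 1 1]
	fmt.Println(out[0].Value()) // [[[[1]]] [[[2]]] [[[3]]] [[[4]]]]
}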
-// If not specified, defaults to <> -// -// REQUIRES: len(value) >= 0 -func SqueezeAxis(value []int64) SqueezeAttr { - return func(m optionalAttr) { - m["squeeze_dims"] = value - } -} - -// Removes dimensions of size 1 from the shape of a tensor. -// -// Given a tensor `input`, this operation returns a tensor of the same type with -// all dimensions of size 1 removed. If you don't want to remove all size 1 -// dimensions, you can remove specific size 1 dimensions by specifying -// `axis`. -// -// For example: -// -// ``` -// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1] -// shape(squeeze(t)) ==> [2, 3] -// ``` -// -// Or, to remove specific size 1 dimensions: -// -// ``` -// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1] -// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1] -// ``` -// -// Arguments: -// input: The `input` to squeeze. -// -// Returns Contains the same data as `input`, but has one or more dimensions of -// size 1 removed. -func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Squeeze", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Inserts a dimension of 1 into a tensor's shape. -// -// Given a tensor `input`, this operation inserts a dimension of 1 at the -// dimension index `axis` of `input`'s shape. The dimension index `axis` starts at -// zero; if you specify a negative number for `axis` it is counted backward from -// the end. -// -// This operation is useful if you want to add a batch dimension to a single -// element. For example, if you have a single image of shape `[height, width, -// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`, -// which will make the shape `[1, height, width, channels]`. -// -// Other examples: -// -// ``` -// # 't' is a tensor of shape [2] -// shape(expand_dims(t, 0)) ==> [1, 2] -// shape(expand_dims(t, 1)) ==> [2, 1] -// shape(expand_dims(t, -1)) ==> [2, 1] -// -// # 't2' is a tensor of shape [2, 3, 5] -// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5] -// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5] -// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1] -// ``` -// -// This operation requires that: -// -// `-1-input.dims() <= dim <= input.dims()` -// -// This operation is related to `squeeze()`, which removes dimensions of -// size 1. -// -// Arguments: -// -// axis: 0-D (scalar). Specifies the dimension index at which to -// expand the shape of `input`. Must be in the range -// `[-rank(input) - 1, rank(input)]`. -// -// Returns Contains the same data as `input`, but its shape has an additional -// dimension of size 1 added. -func ExpandDims(scope *Scope, input tf.Output, axis tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ExpandDims", - Input: []tf.Input{ - input, axis, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// A placeholder op that passes through `input` when its output is not fed. -// -// Arguments: -// input: The default value to produce when `output` is not fed. -// shape: The (possibly partial) shape of the tensor. -// -// Returns A placeholder tensor that defaults to `input` if it is not fed. 
-func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"shape": shape} - opspec := tf.OpSpec{ - Type: "PlaceholderWithDefault", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // A placeholder op for a value that will be fed into the computation. // // DEPRECATED at GraphDef version 23: Placeholder now behaves the same as PlaceholderV2. @@ -1738,181 +1672,6 @@ func PlaceholderV2(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.O return op.Output(0) } -// PlaceholderAttr is an optional argument to Placeholder. -type PlaceholderAttr func(optionalAttr) - -// PlaceholderShape sets the optional shape attribute to value. -// -// value: (Optional) The shape of the tensor. If the shape has 0 dimensions, the -// shape is unconstrained. -// If not specified, defaults to <unknown_rank:true > -func PlaceholderShape(value tf.Shape) PlaceholderAttr { - return func(m optionalAttr) { - m["shape"] = value - } -} - -// A placeholder op for a value that will be fed into the computation. -// -// N.B. This operation will fail with an error if it is executed. It is -// intended as a way to represent a value that will always be fed, and to -// provide attrs that enable the fed value to be checked at runtime. -// -// Arguments: -// dtype: The type of elements in the tensor. -// -// Returns A placeholder tensor that must be replaced using the feed mechanism. -func Placeholder(scope *Scope, dtype tf.DataType, optional ...PlaceholderAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Placeholder", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Pads a tensor with mirrored values. -// -// This operation pads a `input` with mirrored values according to the `paddings` -// you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is -// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates -// how many values to add before the contents of `input` in that dimension, and -// `paddings[D, 1]` indicates how many values to add after the contents of `input` -// in that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater -// than `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true -// (if false, respectively). -// -// The padded size of each dimension D of the output is: -// -// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` -// -// For example: -// -// ``` -// # 't' is [[1, 2, 3], [4, 5, 6]]. -// # 'paddings' is [[1, 1]], [2, 2]]. -// # 'mode' is SYMMETRIC. -// # rank of 't' is 2. -// pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2] -// [2, 1, 1, 2, 3, 3, 2] -// [5, 4, 4, 5, 6, 6, 5] -// [5, 4, 4, 5, 6, 6, 5]] -// ``` -// -// Arguments: -// input: The input tensor to be padded. -// paddings: A two-column matrix specifying the padding sizes. The number of -// rows must be the same as the rank of `input`. -// mode: Either `REFLECT` or `SYMMETRIC`. In reflect mode the padded regions -// do not include the borders, while in symmetric mode the padded regions -// do include the borders. For example, if `input` is `[1, 2, 3]` and `paddings` -// is `[0, 2]`, then the output is `[1, 2, 3, 2, 1]` in reflect mode, and -// it is `[1, 2, 3, 3, 2]` in symmetric mode. 
-// -// Returns The padded tensor. -func MirrorPad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"mode": mode} - opspec := tf.OpSpec{ - Type: "MirrorPad", - Input: []tf.Input{ - input, paddings, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Pads a tensor. -// -// This operation pads `input` according to the `paddings` and `constant_values` -// you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is -// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates -// how many padding values to add before the contents of `input` in that dimension, -// and `paddings[D, 1]` indicates how many padding values to add after the contents -// of `input` in that dimension. `constant_values` is a scalar tensor of the same -// type as `input` that indicates the value to use for padding `input`. -// -// The padded size of each dimension D of the output is: -// -// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` -// -// For example: -// -// ``` -// # 't' is [[1, 1], [2, 2]] -// # 'paddings' is [[1, 1], [2, 2]] -// # 'constant_values' is 0 -// # rank of 't' is 2 -// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] -// [0, 0, 1, 1, 0, 0] -// [0, 0, 2, 2, 0, 0] -// [0, 0, 0, 0, 0, 0]] -// ``` -func PadV2(scope *Scope, input tf.Output, paddings tf.Output, constant_values tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "PadV2", - Input: []tf.Input{ - input, paddings, constant_values, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Pads a tensor with zeros. -// -// This operation pads a `input` with zeros according to the `paddings` you -// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the -// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates -// how many zeros to add before the contents of `input` in that dimension, and -// `paddings[D, 1]` indicates how many zeros to add after the contents of `input` -// in that dimension. -// -// The padded size of each dimension D of the output is: -// -// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` -// -// For example: -// -// ``` -// # 't' is [[1, 1], [2, 2]] -// # 'paddings' is [[1, 1], [2, 2]] -// # rank of 't' is 2 -// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] -// [0, 0, 1, 1, 0, 0] -// [0, 0, 2, 2, 0, 0] -// [0, 0, 0, 0, 0, 0]] -// ``` -// -func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Pad", - Input: []tf.Input{ - input, paddings, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Return the reduction indices for computing gradients of s0 op s1 with broadcast. // // This is typically used by gradient computations for a broadcasting operation. @@ -1930,45 +1689,6 @@ func BroadcastGradientArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Outp return op.Output(0), op.Output(1) } -// Return the shape of s0 op s1 with broadcast. -// -// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the -// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors. 
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "BroadcastArgs", - Input: []tf.Input{ - s0, s1, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the gradient of `Tile`. -// -// DEPRECATED at GraphDef version 3: TileGrad has been replaced with reduce_sum -// -// Since `Tile` takes an input and repeats the input `multiples` times -// along each dimension, `TileGrad` takes in `multiples` and aggregates -// each repeated tile of `input` into `output`. -func TileGrad(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TileGrad", - Input: []tf.Input{ - input, multiples, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Constructs a tensor by tiling a given tensor. // // This operation creates a new tensor by replicating `input` `multiples` times. @@ -2064,6 +1784,149 @@ func TensorStridedSliceUpdate(scope *Scope, input tf.Output, begin tf.Output, en return op.Output(0) } +// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign. +type ResourceStridedSliceAssignAttr func(optionalAttr) + +// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value. +// If not specified, defaults to 0 +func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr { + return func(m optionalAttr) { + m["begin_mask"] = value + } +} + +// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value. +// If not specified, defaults to 0 +func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr { + return func(m optionalAttr) { + m["end_mask"] = value + } +} + +// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value. +// If not specified, defaults to 0 +func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr { + return func(m optionalAttr) { + m["ellipsis_mask"] = value + } +} + +// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value. +// If not specified, defaults to 0 +func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr { + return func(m optionalAttr) { + m["new_axis_mask"] = value + } +} + +// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value. +// If not specified, defaults to 0 +func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr { + return func(m optionalAttr) { + m["shrink_axis_mask"] = value + } +} + +// Assign `value` to the sliced l-value reference of `ref`. +// +// The values of `value` are assigned to the positions in the variable +// `ref` that are selected by the slice parameters. The slice parameters +// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`. +// +// NOTE this op currently does not support broadcasting and so `value`'s +// shape must be exactly the shape produced by the slice of `ref`. +// +// Returns the created operation. 
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceStridedSliceAssign", + Input: []tf.Input{ + ref, begin, end, strides, value, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// StridedSliceGradAttr is an optional argument to StridedSliceGrad. +type StridedSliceGradAttr func(optionalAttr) + +// StridedSliceGradBeginMask sets the optional begin_mask attribute to value. +// If not specified, defaults to 0 +func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr { + return func(m optionalAttr) { + m["begin_mask"] = value + } +} + +// StridedSliceGradEndMask sets the optional end_mask attribute to value. +// If not specified, defaults to 0 +func StridedSliceGradEndMask(value int64) StridedSliceGradAttr { + return func(m optionalAttr) { + m["end_mask"] = value + } +} + +// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value. +// If not specified, defaults to 0 +func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr { + return func(m optionalAttr) { + m["ellipsis_mask"] = value + } +} + +// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value. +// If not specified, defaults to 0 +func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr { + return func(m optionalAttr) { + m["new_axis_mask"] = value + } +} + +// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value. +// If not specified, defaults to 0 +func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr { + return func(m optionalAttr) { + m["shrink_axis_mask"] = value + } +} + +// Returns the gradient of `StridedSlice`. +// +// Since `StridedSlice` cuts out pieces of its `input` which is size +// `shape`, its gradient will have the same shape (which is passed here +// as `shape`). The gradient will be zero in any element that the slice +// does not select. +// +// Arguments are the same as StridedSliceGrad with the exception that +// `dy` is the input gradient to be propagated and `shape` is the +// shape of `StridedSlice`'s `input`. +func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StridedSliceGrad", + Input: []tf.Input{ + shape, begin, end, strides, dy, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // StridedSliceAttr is an optional argument to StridedSlice. type StridedSliceAttr func(optionalAttr) @@ -2257,71 +2120,124 @@ func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output, return op.Output(0) } -// Return a slice from 'input'. +// Returns the rank of a tensor. // -// The output tensor is a tensor with dimensions described by 'size' -// whose values are extracted from 'input' starting at the offsets in -// 'begin'. +// This operation returns an integer representing the rank of `input`. 
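// Usage sketch for StridedSliceGrad above (not part of the generated
// wrappers). It scatters an incoming gradient `dy`, which has the shape of a
// StridedSlice result, back into the shape of the original input; positions
// the slice did not select come back as zeros. The shapes are illustrative.
package example

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

// sliceGrad propagates dy (shape [2, 3]) through the slice input[0:2, 0:3]
// of an original [4, 3] input, producing a [4, 3] gradient.
func sliceGrad(s *op.Scope, dy tf.Output) tf.Output {
	shape := op.Const(s.SubScope("shape"), []int32{4, 3})
	begin := op.Const(s.SubScope("begin"), []int32{0, 0})
	end := op.Const(s.SubScope("end"), []int32{2, 3})
	strides := op.Const(s.SubScope("strides"), []int32{1, 1})
	return op.StridedSliceGrad(s, shape, begin, end, strides, dy)
}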
// -// *Requirements*: -// 0 <= begin[i] <= begin[i] + size[i] <= Di for i in [0, n) +// For example: // -// Arguments: +// ``` +// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] +// # shape of tensor 't' is [2, 2, 3] +// rank(t) ==> 3 +// ``` // -// begin: begin[i] specifies the offset into the 'i'th dimension of -// 'input' to slice from. -// size: size[i] specifies the number of elements of the 'i'th dimension -// of 'input' to slice. If size[i] is -1, all remaining elements in dimension -// i are included in the slice (i.e. this is equivalent to setting -// size[i] = input.dim_size(i) - begin[i]). -func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) { +// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank +// of a tensor is the number of indices required to uniquely select each element +// of the tensor. Rank is also known as "order", "degree", or "ndims." +func Rank(scope *Scope, input tf.Output) (output tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "Slice", + Type: "Rank", Input: []tf.Input{ - input, begin, size, + input, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// SizeAttr is an optional argument to Size. -type SizeAttr func(optionalAttr) +// ReverseSequenceAttr is an optional argument to ReverseSequence. +type ReverseSequenceAttr func(optionalAttr) -// SizeOutType sets the optional out_type attribute to value. -// If not specified, defaults to DT_INT32 -func SizeOutType(value tf.DataType) SizeAttr { +// ReverseSequenceBatchDim sets the optional batch_dim attribute to value. +// +// value: The dimension along which reversal is performed. +// If not specified, defaults to 0 +func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr { return func(m optionalAttr) { - m["out_type"] = value + m["batch_dim"] = value } } -// Returns the size of a tensor. +// Reverses variable length slices. // -// This operation returns an integer representing the number of elements in -// `input`. +// This op first slices `input` along the dimension `batch_dim`, and for each +// slice `i`, reverses the first `seq_lengths[i]` elements along +// the dimension `seq_dim`. +// +// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`, +// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`. +// +// The output slice `i` along dimension `batch_dim` is then given by input +// slice `i`, with the first `seq_lengths[i]` slices along dimension +// `seq_dim` reversed. // // For example: // // ``` -// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]] -// size(t) ==> 12 +// # Given this: +// batch_dim = 0 +// seq_dim = 1 +// input.dims = (4, 8, ...) +// seq_lengths = [7, 2, 3, 5] +// +// # then slices of input are reversed on seq_dim, but only up to seq_lengths: +// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...] +// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...] +// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...] +// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...] +// +// # while entries past seq_lens are copied through: +// output[0, 7:, :, ...] = input[0, 7:, :, ...] +// output[1, 2:, :, ...] = input[1, 2:, :, ...] +// output[2, 3:, :, ...] = input[2, 3:, :, ...] +// output[3, 2:, :, ...] = input[3, 2:, :, ...] // ``` -func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) { +// +// In contrast, if: +// +// ``` +// # Given this: +// batch_dim = 2 +// seq_dim = 0 +// input.dims = (8, ?, 4, ...) 
+// seq_lengths = [7, 2, 3, 5] +// +// # then slices of input are reversed on seq_dim, but only up to seq_lengths: +// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...] +// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...] +// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...] +// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...] +// +// # while entries past seq_lens are copied through: +// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...] +// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...] +// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...] +// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...] +// ``` +// +// Arguments: +// input: The input to reverse. +// seq_lengths: 1-D with length `input.dims(batch_dim)` and +// `max(seq_lengths) <= input.dims(seq_dim)` +// seq_dim: The dimension which is partially reversed. +// +// Returns The partially reversed input. It has the same shape as `input`. +func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{} + attrs := map[string]interface{}{"seq_dim": seq_dim} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "Size", + Type: "ReverseSequence", Input: []tf.Input{ - input, + input, seq_lengths, }, Attrs: attrs, } @@ -2355,6 +2271,189 @@ func EnsureShape(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Outpu return op.Output(0) } +// DepthToSpaceAttr is an optional argument to DepthToSpace. +type DepthToSpaceAttr func(optionalAttr) + +// DepthToSpaceDataFormat sets the optional data_format attribute to value. +// If not specified, defaults to "NHWC" +func DepthToSpaceDataFormat(value string) DepthToSpaceAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// DepthToSpace for tensors of type T. +// +// Rearranges data from depth into blocks of spatial data. +// This is the reverse transformation of SpaceToDepth. More specifically, +// this op outputs a copy of the input tensor where values from the `depth` +// dimension are moved in spatial blocks to the `height` and `width` dimensions. +// The attr `block_size` indicates the input block size and how the data is moved. +// +// * Chunks of data of size `block_size * block_size` from depth are rearranged +// into non-overlapping blocks of size `block_size x block_size` +// * The width the output tensor is `input_depth * block_size`, whereas the +// height is `input_height * block_size`. +// * The Y, X coordinates within each block of the output image are determined +// by the high order component of the input channel index. +// * The depth of the input tensor must be divisible by +// `block_size * block_size`. +// +// The `data_format` attr specifies the layout of the input and output tensors +// with the following options: +// "NHWC": `[ batch, height, width, channels ]` +// "NCHW": `[ batch, channels, height, width ]` +// "NCHW_VECT_C": +// `qint8 [ batch, channels / 4, height, width, 4 ]` +// +// It is useful to consider the operation as transforming a 6-D Tensor. +// e.g. for data_format = NHWC, +// Each element in the input tensor can be specified via 6 coordinates, +// ordered by decreasing memory layout significance as: +// n,iY,iX,bY,bX,oC (where n=batch index, iX, iY means X or Y coordinates +// within the input image, bX, bY means coordinates +// within the output block, oC means output channels). 
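// Usage sketch for ReverseSequence above (not part of the generated
// wrappers): each row of a padded [batch, time, ...] tensor is reversed only
// up to its own length. Here batch_dim is 0 and seq_dim is 1, and the input
// is assumed to have batch size 4; the lengths are illustrative.
package example

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

// reversePadded reverses the first seqLengths[i] steps of row i along the
// time dimension, leaving the padding at the end of each row untouched.
func reversePadded(s *op.Scope, input tf.Output) tf.Output {
	seqLengths := op.Const(s.SubScope("seq_lengths"), []int64{7, 2, 3, 5})
	return op.ReverseSequence(s, input, seqLengths, 1,
		op.ReverseSequenceBatchDim(0))
}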
+// The output would be the input transposed to the following layout: +// n,iY,bY,iX,bX,oC +// +// This operation is useful for resizing the activations between convolutions +// (but keeping all data), e.g. instead of pooling. It is also useful for training +// purely convolutional models. +// +// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and +// block_size = 2: +// +// ``` +// x = [[[[1, 2, 3, 4]]]] +// +// ``` +// +// This operation will output a tensor of shape `[1, 2, 2, 1]`: +// +// ``` +// [[[[1], [2]], +// [[3], [4]]]] +// ``` +// +// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`, +// the corresponding output will have 2x2 elements and will have a depth of +// 1 channel (1 = `4 / (block_size * block_size)`). +// The output element shape is `[2, 2, 1]`. +// +// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g. +// +// ``` +// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]] +// ``` +// +// This operation, for block size of 2, will return the following tensor of shape +// `[1, 2, 2, 3]` +// +// ``` +// [[[[1, 2, 3], [4, 5, 6]], +// [[7, 8, 9], [10, 11, 12]]]] +// +// ``` +// +// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2: +// +// ``` +// x = [[[[1, 2, 3, 4], +// [5, 6, 7, 8]], +// [[9, 10, 11, 12], +// [13, 14, 15, 16]]]] +// ``` +// +// the operator will return the following tensor of shape `[1 4 4 1]`: +// +// ``` +// x = [[[ [1], [2], [5], [6]], +// [ [3], [4], [7], [8]], +// [ [9], [10], [13], [14]], +// [ [11], [12], [15], [16]]]] +// +// ``` +// +// Arguments: +// +// block_size: The size of the spatial block, same as in Space2Depth. +func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"block_size": block_size} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DepthToSpace", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Return the shape of s0 op s1 with broadcast. +// +// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the +// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors. +func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BroadcastArgs", + Input: []tf.Input{ + s0, s1, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ShapeAttr is an optional argument to Shape. +type ShapeAttr func(optionalAttr) + +// ShapeOutType sets the optional out_type attribute to value. +// If not specified, defaults to DT_INT32 +func ShapeOutType(value tf.DataType) ShapeAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// Returns the shape of a tensor. +// +// This operation returns a 1-D integer tensor representing the shape of `input`. 
+// +// For example: +// +// ``` +// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] +// shape(t) ==> [2, 2, 3] +// ``` +func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Shape", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // UniqueWithCountsV2Attr is an optional argument to UniqueWithCountsV2. type UniqueWithCountsV2Attr func(optionalAttr) @@ -2725,46 +2824,23 @@ func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) return op.Output(0) } -// PreventGradientAttr is an optional argument to PreventGradient. -type PreventGradientAttr func(optionalAttr) - -// PreventGradientMessage sets the optional message attribute to value. +// Checks a tensor for NaN and Inf values. // -// value: Will be printed in the error when anyone tries to differentiate -// this operation. -// If not specified, defaults to "" -func PreventGradientMessage(value string) PreventGradientAttr { - return func(m optionalAttr) { - m["message"] = value - } -} - -// An identity op that triggers an error if a gradient is requested. -// -// When executed in a graph, this op outputs its input tensor as-is. -// -// When building ops to compute gradients, the TensorFlow gradient system -// will return an error when trying to lookup the gradient of this op, -// because no gradient must ever be registered for this function. This -// op exists to prevent subtle bugs from silently returning unimplemented -// gradients in some corner cases. +// When run, reports an `InvalidArgument` error if `tensor` has any values +// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is. // // Arguments: -// input: any tensor. // -// Returns the same input tensor. -func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) { +// message: Prefix of the error message. +func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } + attrs := map[string]interface{}{"message": message} opspec := tf.OpSpec{ - Type: "PreventGradient", + Type: "CheckNumerics", Input: []tf.Input{ - input, + tensor, }, Attrs: attrs, } @@ -2896,128 +2972,73 @@ func Identity(scope *Scope, input tf.Output) (output tf.Output) { return op.Output(0) } -// Gather slices from `params` into a Tensor with shape specified by `indices`. +// FakeQuantWithMinMaxArgsAttr is an optional argument to FakeQuantWithMinMaxArgs. +type FakeQuantWithMinMaxArgsAttr func(optionalAttr) + +// FakeQuantWithMinMaxArgsMin sets the optional min attribute to value. +// If not specified, defaults to -6 +func FakeQuantWithMinMaxArgsMin(value float32) FakeQuantWithMinMaxArgsAttr { + return func(m optionalAttr) { + m["min"] = value + } +} + +// FakeQuantWithMinMaxArgsMax sets the optional max attribute to value. +// If not specified, defaults to 6 +func FakeQuantWithMinMaxArgsMax(value float32) FakeQuantWithMinMaxArgsAttr { + return func(m optionalAttr) { + m["max"] = value + } +} + +// FakeQuantWithMinMaxArgsNumBits sets the optional num_bits attribute to value. 
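// Usage sketch for Shape above (and Rank, defined earlier in this file), not
// part of the generated wrappers. ShapeOutType switches the result dtype from
// the default int32 to int64, e.g. for very large shapes or to match other
// int64 index tensors. The sub-scope names are illustrative.
package example

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

// shapeAndRank returns the dynamic shape of x as an int64 vector together
// with its rank as a scalar int32.
func shapeAndRank(s *op.Scope, x tf.Output) (shape, rank tf.Output) {
	shape = op.Shape(s.SubScope("shape"), x, op.ShapeOutType(tf.Int64))
	rank = op.Rank(s.SubScope("rank"), x)
	return shape, rank
}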
+// If not specified, defaults to 8 +func FakeQuantWithMinMaxArgsNumBits(value int64) FakeQuantWithMinMaxArgsAttr { + return func(m optionalAttr) { + m["num_bits"] = value + } +} + +// FakeQuantWithMinMaxArgsNarrowRange sets the optional narrow_range attribute to value. +// If not specified, defaults to false +func FakeQuantWithMinMaxArgsNarrowRange(value bool) FakeQuantWithMinMaxArgsAttr { + return func(m optionalAttr) { + m["narrow_range"] = value + } +} + +// Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type. // -// `indices` is an K-dimensional integer tensor, best thought of as a -// (K-1)-dimensional tensor of indices into `params`, where each element defines a -// slice of `params`: +// Attributes `[min; max]` define the clamping range for the `inputs` data. +// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]` +// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and +// then de-quantized and output as floats in `[min; max]` interval. +// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive. // -// output[\\(i_0, ..., i_{K-2}\\)] = params[indices[\\(i_0, ..., i_{K-2}\\)]] +// Before quantization, `min` and `max` values are adjusted with the following +// logic. +// It is suggested to have `min <= 0 <= max`. If `0` is not in the range of values, +// the behavior can be unexpected: +// If `0 < min < max`: `min_adj = 0` and `max_adj = max - min`. +// If `min < max < 0`: `min_adj = min - max` and `max_adj = 0`. +// If `min <= 0 <= max`: `scale = (max - min) / (2^num_bits - 1) `, +// `min_adj = scale * round(min / scale)` and `max_adj = max + min_adj - min`. // -// Whereas in `tf.gather` `indices` defines slices into the first -// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the -// first `N` dimensions of `params`, where `N = indices.shape[-1]`. -// -// The last dimension of `indices` can be at most the rank of -// `params`: -// -// indices.shape[-1] <= params.rank -// -// The last dimension of `indices` corresponds to elements -// (if `indices.shape[-1] == params.rank`) or slices -// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]` -// of `params`. The output tensor has shape -// -// indices.shape[:-1] + params.shape[indices.shape[-1]:] -// -// Note that on CPU, if an out of bound index is found, an error is returned. -// On GPU, if an out of bound index is found, a 0 is stored in the -// corresponding output value. -// -// Some examples below. 
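// Usage sketch for FakeQuantWithMinMaxArgs above (not part of the generated
// wrappers): inputs are clamped to [min, max], quantized into 2^num_bits
// buckets and de-quantized back to float, which is how quantization effects
// are simulated during training. The range and bit width are illustrative.
package example

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

// fakeQuant8 fake-quantizes activations to 8 bits over [-1, 1].
func fakeQuant8(s *op.Scope, activations tf.Output) tf.Output {
	return op.FakeQuantWithMinMaxArgs(s, activations,
		op.FakeQuantWithMinMaxArgsMin(-1.0),
		op.FakeQuantWithMinMaxArgsMax(1.0),
		op.FakeQuantWithMinMaxArgsNumBits(8),
	)
}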
-// -// Simple indexing into a matrix: -// -// ```python -// indices = [[0, 0], [1, 1]] -// params = [['a', 'b'], ['c', 'd']] -// output = ['a', 'd'] -// ``` -// -// Slice indexing into a matrix: -// -// ```python -// indices = [[1], [0]] -// params = [['a', 'b'], ['c', 'd']] -// output = [['c', 'd'], ['a', 'b']] -// ``` -// -// Indexing into a 3-tensor: -// -// ```python -// indices = [[1]] -// params = [[['a0', 'b0'], ['c0', 'd0']], -// [['a1', 'b1'], ['c1', 'd1']]] -// output = [[['a1', 'b1'], ['c1', 'd1']]] -// -// -// indices = [[0, 1], [1, 0]] -// params = [[['a0', 'b0'], ['c0', 'd0']], -// [['a1', 'b1'], ['c1', 'd1']]] -// output = [['c0', 'd0'], ['a1', 'b1']] -// -// -// indices = [[0, 0, 1], [1, 0, 1]] -// params = [[['a0', 'b0'], ['c0', 'd0']], -// [['a1', 'b1'], ['c1', 'd1']]] -// output = ['b0', 'b1'] -// ``` -// -// Batched indexing into a matrix: -// -// ```python -// indices = [[[0, 0]], [[0, 1]]] -// params = [['a', 'b'], ['c', 'd']] -// output = [['a'], ['b']] -// ``` -// -// Batched slice indexing into a matrix: -// -// ```python -// indices = [[[1]], [[0]]] -// params = [['a', 'b'], ['c', 'd']] -// output = [[['c', 'd']], [['a', 'b']]] -// ``` -// -// Batched indexing into a 3-tensor: -// -// ```python -// indices = [[[1]], [[0]]] -// params = [[['a0', 'b0'], ['c0', 'd0']], -// [['a1', 'b1'], ['c1', 'd1']]] -// output = [[[['a1', 'b1'], ['c1', 'd1']]], -// [[['a0', 'b0'], ['c0', 'd0']]]] -// -// indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]] -// params = [[['a0', 'b0'], ['c0', 'd0']], -// [['a1', 'b1'], ['c1', 'd1']]] -// output = [[['c0', 'd0'], ['a1', 'b1']], -// [['a0', 'b0'], ['c1', 'd1']]] -// -// -// indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]] -// params = [[['a0', 'b0'], ['c0', 'd0']], -// [['a1', 'b1'], ['c1', 'd1']]] -// output = [['b0', 'b1'], ['d0', 'c1']] -// ``` -// -// See also `tf.gather` and `tf.batch_gather`. -// -// Arguments: -// params: The tensor from which to gather values. -// indices: Index tensor. -// -// Returns Values from `params` gathered from indices given by `indices`, with -// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`. -func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) { +// Quantization is called fake since the output is still in floating point. +func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsAttr) (outputs tf.Output) { if scope.Err() != nil { return } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } opspec := tf.OpSpec{ - Type: "GatherNd", + Type: "FakeQuantWithMinMaxArgs", Input: []tf.Input{ - params, indices, + inputs, }, + Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) @@ -3076,60 +3097,164 @@ func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) return op.Output(0) } -// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars. -type FakeQuantWithMinMaxVarsAttr func(optionalAttr) - -// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value. -// If not specified, defaults to 8 -func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr { - return func(m optionalAttr) { - m["num_bits"] = value - } -} - -// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value. 
-// If not specified, defaults to false -func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr { - return func(m optionalAttr) { - m["narrow_range"] = value - } -} - -// Fake-quantize the 'inputs' tensor of type float via global float scalars `min` +// Reverses specific dimensions of a tensor. // -// and `max` to 'outputs' tensor of same shape as `inputs`. +// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions +// of `tensor`, this operation reverses each dimension i of `tensor` where +// `dims[i]` is `True`. // -// `[min; max]` define the clamping range for the `inputs` data. -// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]` -// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and -// then de-quantized and output as floats in `[min; max]` interval. -// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive. +// `tensor` can have up to 8 dimensions. The number of dimensions +// of `tensor` must equal the number of elements in `dims`. In other words: // -// Before quantization, `min` and `max` values are adjusted with the following -// logic. -// It is suggested to have `min <= 0 <= max`. If `0` is not in the range of values, -// the behavior can be unexpected: -// If `0 < min < max`: `min_adj = 0` and `max_adj = max - min`. -// If `min < max < 0`: `min_adj = min - max` and `max_adj = 0`. -// If `min <= 0 <= max`: `scale = (max - min) / (2^num_bits - 1) `, -// `min_adj = scale * round(min / scale)` and `max_adj = max + min_adj - min`. +// `rank(tensor) = size(dims)` // -// This operation has a gradient and thus allows for training `min` and `max` -// values. -func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) { +// For example: +// +// ``` +// # tensor 't' is [[[[ 0, 1, 2, 3], +// # [ 4, 5, 6, 7], +// # [ 8, 9, 10, 11]], +// # [[12, 13, 14, 15], +// # [16, 17, 18, 19], +// # [20, 21, 22, 23]]]] +// # tensor 't' shape is [1, 2, 3, 4] +// +// # 'dims' is [False, False, False, True] +// reverse(t, dims) ==> [[[[ 3, 2, 1, 0], +// [ 7, 6, 5, 4], +// [ 11, 10, 9, 8]], +// [[15, 14, 13, 12], +// [19, 18, 17, 16], +// [23, 22, 21, 20]]]] +// +// # 'dims' is [False, True, False, False] +// reverse(t, dims) ==> [[[[12, 13, 14, 15], +// [16, 17, 18, 19], +// [20, 21, 22, 23] +// [[ 0, 1, 2, 3], +// [ 4, 5, 6, 7], +// [ 8, 9, 10, 11]]]] +// +// # 'dims' is [False, False, True, False] +// reverse(t, dims) ==> [[[[8, 9, 10, 11], +// [4, 5, 6, 7], +// [0, 1, 2, 3]] +// [[20, 21, 22, 23], +// [16, 17, 18, 19], +// [12, 13, 14, 15]]]] +// ``` +// +// Arguments: +// tensor: Up to 8-D. +// dims: 1-D. The dimensions to reverse. +// +// Returns The same shape as `tensor`. +func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) + opspec := tf.OpSpec{ + Type: "Reverse", + Input: []tf.Input{ + tensor, dims, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the batched diagonal part of a batched tensor. +// +// This operation returns a tensor with the `diagonal` part +// of the batched `input`. 
The `diagonal` part is computed as follows: +// +// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a +// tensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where: +// +// `diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`. +// +// The input must be at least a matrix. +// +// For example: +// +// ``` +// # 'input' is [[[1, 0, 0, 0] +// [0, 2, 0, 0] +// [0, 0, 3, 0] +// [0, 0, 0, 4]], +// [[5, 0, 0, 0] +// [0, 6, 0, 0] +// [0, 0, 7, 0] +// [0, 0, 0, 8]]] +// +// and input.shape = (2, 4, 4) +// +// tf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]] +// +// which has shape (2, 4) +// ``` +// +// Arguments: +// input: Rank `k` tensor where `k >= 2`. +// +// Returns The extracted diagonal(s) having shape +// `diagonal.shape = input.shape[:-2] + [min(input.shape[-2:])]`. +func MatrixDiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) { + if scope.Err() != nil { + return } opspec := tf.OpSpec{ - Type: "FakeQuantWithMinMaxVars", + Type: "MatrixDiagPart", Input: []tf.Input{ - inputs, min, max, + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns a batched diagonal tensor with a given batched diagonal values. +// +// Given a `diagonal`, this operation returns a tensor with the `diagonal` and +// everything else padded with zeros. The diagonal is computed as follows: +// +// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a +// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where: +// +// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`. +// +// For example: +// +// ``` +// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]] +// +// and diagonal.shape = (2, 4) +// +// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0] +// [0, 2, 0, 0] +// [0, 0, 3, 0] +// [0, 0, 0, 4]], +// [[5, 0, 0, 0] +// [0, 6, 0, 0] +// [0, 0, 7, 0] +// [0, 0, 0, 8]]] +// +// which has shape (2, 4, 4) +// ``` +// +// Arguments: +// diagonal: Rank `k`, where `k >= 1`. +// +// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`. +func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "MatrixDiag", + Input: []tf.Input{ + diagonal, }, - Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) @@ -3154,75 +3279,6 @@ func ConjugateTranspose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) return op.Output(0) } -// Reverses specific dimensions of a tensor. -// -// NOTE `tf.reverse` has now changed behavior in preparation for 1.0. -// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0. -// -// Given a `tensor`, and a `int32` tensor `axis` representing the set of -// dimensions of `tensor` to reverse. This operation reverses each dimension -// `i` for which there exists `j` s.t. `axis[j] == i`. -// -// `tensor` can have up to 8 dimensions. The number of dimensions specified -// in `axis` may be 0 or more entries. If an index is specified more than -// once, a InvalidArgument error is raised. 
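// Usage sketch for MatrixDiag and MatrixDiagPart above (not part of the
// generated wrappers): a batch of diagonal vectors is expanded into square
// matrices and the diagonals are then read back, matching the example in the
// comments. The batch size and values are illustrative.
package example

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

// diagRoundTrip builds a (2, 4, 4) batch of diagonal matrices from a (2, 4)
// input and recovers the original (2, 4) diagonals from it.
func diagRoundTrip(s *op.Scope) (matrices, diags tf.Output) {
	diagonal := op.Const(s.SubScope("diagonal"),
		[][]float32{{1, 2, 3, 4}, {5, 6, 7, 8}})
	matrices = op.MatrixDiag(s, diagonal)
	diags = op.MatrixDiagPart(s, matrices)
	return matrices, diags
}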
-// -// For example: -// -// ``` -// # tensor 't' is [[[[ 0, 1, 2, 3], -// # [ 4, 5, 6, 7], -// # [ 8, 9, 10, 11]], -// # [[12, 13, 14, 15], -// # [16, 17, 18, 19], -// # [20, 21, 22, 23]]]] -// # tensor 't' shape is [1, 2, 3, 4] -// -// # 'dims' is [3] or 'dims' is [-1] -// reverse(t, dims) ==> [[[[ 3, 2, 1, 0], -// [ 7, 6, 5, 4], -// [ 11, 10, 9, 8]], -// [[15, 14, 13, 12], -// [19, 18, 17, 16], -// [23, 22, 21, 20]]]] -// -// # 'dims' is '[1]' (or 'dims' is '[-3]') -// reverse(t, dims) ==> [[[[12, 13, 14, 15], -// [16, 17, 18, 19], -// [20, 21, 22, 23] -// [[ 0, 1, 2, 3], -// [ 4, 5, 6, 7], -// [ 8, 9, 10, 11]]]] -// -// # 'dims' is '[2]' (or 'dims' is '[-2]') -// reverse(t, dims) ==> [[[[8, 9, 10, 11], -// [4, 5, 6, 7], -// [0, 1, 2, 3]] -// [[20, 21, 22, 23], -// [16, 17, 18, 19], -// [12, 13, 14, 15]]]] -// ``` -// -// Arguments: -// tensor: Up to 8-D. -// axis: 1-D. The indices of the dimensions to reverse. Must be in the range -// `[-rank(tensor), rank(tensor))`. -// -// Returns The same shape as `tensor`. -func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ReverseV2", - Input: []tf.Input{ - tensor, axis, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Returns the diagonal part of the tensor. // // This operation returns a tensor with the `diagonal` part @@ -3262,36 +3318,20 @@ func DiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) { return op.Output(0) } -// Returns a diagonal tensor with a given diagonal values. -// -// Given a `diagonal`, this operation returns a tensor with the `diagonal` and -// everything else padded with zeros. The diagonal is computed as follows: -// -// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of -// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where: -// -// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else. -// -// For example: -// -// ``` -// # 'diagonal' is [1, 2, 3, 4] -// tf.diag(diagonal) ==> [[1, 0, 0, 0] -// [0, 2, 0, 0] -// [0, 0, 3, 0] -// [0, 0, 0, 4]] -// ``` +// Returns a tensor of ones with the same shape and type as x. // // Arguments: -// diagonal: Rank k tensor where k is at most 1. -func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) { +// x: a tensor of type T. +// +// Returns a tensor of the same shape and type as x but filled with ones. +func OnesLike(scope *Scope, x tf.Output) (y tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "Diag", + Type: "OnesLike", Input: []tf.Input{ - diagonal, + x, }, } op := scope.AddOperation(opspec) @@ -3318,6 +3358,28 @@ func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) { return op.Output(0) } +// Gives a guarantee to the TF runtime that the input tensor is a constant. +// +// The runtime is then free to make optimizations based on this. +// +// Only accepts value typed tensors as inputs and rejects resource variable handles +// as input. +// +// Returns the input tensor without modification. +func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "GuaranteeConst", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Splits a tensor into `num_split` tensors along one dimension. 
// // Arguments: @@ -3357,25 +3419,242 @@ func SplitV(scope *Scope, value tf.Output, size_splits tf.Output, axis tf.Output return output } -// Concatenates tensors along one dimension. +// Splits a tensor into `num_split` tensors along one dimension. // // Arguments: -// values: List of `N` Tensors to concatenate. Their ranks and types must match, -// and their sizes must match in all dimensions except `concat_dim`. -// axis: 0-D. The dimension along which to concatenate. Must be in the -// range [-rank(values), rank(values)). +// axis: 0-D. The dimension along which to split. Must be in the range +// `[-rank(value), rank(value))`. +// value: The tensor to split. +// num_split: The number of ways to split. Must evenly divide +// `value.shape[split_dim]`. // -// Returns A `Tensor` with the concatenation of values stacked along the -// `concat_dim` dimension. This tensor's shape matches that of `values` except -// in `concat_dim` where it has the sum of the sizes. -func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) { +// Returns They are identically shaped tensors, whose shape matches that of `value` +// except along `axis`, where their sizes are +// `values.shape[split_dim] / num_split`. +func Split(scope *Scope, axis tf.Output, value tf.Output, num_split int64) (output []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_split": num_split} + opspec := tf.OpSpec{ + Type: "Split", + Input: []tf.Input{ + axis, value, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if output, idx, err = makeOutputList(op, idx, "output"); err != nil { + scope.UpdateErr("Split", err) + return + } + return output +} + +// Broadcast an array for a compatible shape. +// +// Broadcasting is the process of making arrays to have compatible shapes +// for arithmetic operations. Two shapes are compatible if for each +// dimension pair they are either equal or one of them is one. When trying +// to broadcast a Tensor to a shape, it starts with the trailing dimensions, +// and works its way forward. +// +// For example, +// +// ```python +// >>> x = tf.constant([1, 2, 3]) +// >>> y = tf.broadcast_to(x, [3, 3]) +// >>> sess.run(y) +// array([[1, 2, 3], +// [1, 2, 3], +// [1, 2, 3]], dtype=int32) +// ``` +// +// In the above example, the input Tensor with the shape of `[1, 3]` +// is broadcasted to output Tensor with shape of `[3, 3]`. +// +// Arguments: +// input: A Tensor to broadcast. +// shape: An 1-D `int` Tensor. The shape of the desired output. +// +// Returns A Tensor. +func BroadcastTo(scope *Scope, input tf.Output, shape tf.Output) (output tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "ConcatV2", + Type: "BroadcastTo", Input: []tf.Input{ - tf.OutputList(values), axis, + input, shape, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Converts a flat index or array of flat indices into a tuple of +// +// coordinate arrays. +// +// @compatibility(numpy) +// Equivalent to np.unravel_index +// @end_compatibility +// +// Arguments: +// indices: An 0-D or 1-D `int` Tensor whose elements are indices into the +// flattened version of an array of dimensions dims. +// dims: An 1-D `int` Tensor. The shape of the array to use for unraveling +// indices. +// +// Returns An 2-D (or 1-D if indices is 0-D) tensor where each row has the +// same shape as the indices array. 
+func UnravelIndex(scope *Scope, indices tf.Output, dims tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "UnravelIndex", + Input: []tf.Input{ + indices, dims, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// UnpackAttr is an optional argument to Unpack. +type UnpackAttr func(optionalAttr) + +// UnpackAxis sets the optional axis attribute to value. +// +// value: Dimension along which to unpack. Negative values wrap around, so the +// valid range is `[-R, R)`. +// If not specified, defaults to 0 +func UnpackAxis(value int64) UnpackAttr { + return func(m optionalAttr) { + m["axis"] = value + } +} + +// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors. +// +// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension. +// For example, given a tensor of shape `(A, B, C, D)`; +// +// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]` +// and each tensor in `output` will have shape `(B, C, D)`. (Note that the +// dimension unpacked along is gone, unlike `split`). +// +// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]` +// and each tensor in `output` will have shape `(A, C, D)`. +// Etc. +// +// This is the opposite of `pack`. +// +// Arguments: +// value: 1-D or higher, with `axis` dimension size equal to `num`. +// +// +// Returns The list of tensors unpacked from `value`. +func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num": num} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Unpack", + Input: []tf.Input{ + value, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if output, idx, err = makeOutputList(op, idx, "output"); err != nil { + scope.UpdateErr("Unpack", err) + return + } + return output +} + +// Subtracts `v` into specified rows of `x`. +// +// Computes y = x; y[i, :] -= v; return y. +// +// Arguments: +// x: A `Tensor` of type T. +// i: A vector. Indices into the left-most dimension of `x`. +// v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size. +// +// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`. +func InplaceSub(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "InplaceSub", + Input: []tf.Input{ + x, i, v, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Adds v into specified rows of x. +// +// Computes y = x; y[i, :] += v; return y. +// +// Arguments: +// x: A `Tensor` of type T. +// i: A vector. Indices into the left-most dimension of `x`. +// v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size. +// +// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`. +func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "InplaceAdd", + Input: []tf.Input{ + x, i, v, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Updates specified rows with values in `v`. 
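// Usage sketch for Split above (not part of the generated wrappers). Unlike
// most wrappers, Split and Unpack return a []tf.Output, one element per
// piece, so callers index into the slice rather than taking a single output.
// The axis and split count below are illustrative.
package example

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

// splitColumns splits a [rows, 6] tensor into three [rows, 2] tensors along
// axis 1; value.shape[1] must be evenly divisible by num_split.
func splitColumns(s *op.Scope, value tf.Output) []tf.Output {
	axis := op.Const(s.SubScope("axis"), int32(1))
	return op.Split(s, axis, value, 3)
}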
+// +// Computes `x[i, :] = v; return x`. +// +// Arguments: +// x: A tensor of type `T`. +// i: A vector. Indices into the left-most dimension of `x`. +// v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size. +// +// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`. +func InplaceUpdate(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "InplaceUpdate", + Input: []tf.Input{ + x, i, v, }, } op := scope.AddOperation(opspec) @@ -3403,333 +3682,29 @@ func DeepCopy(scope *Scope, x tf.Output) (y tf.Output) { return op.Output(0) } -// Concatenates a list of `N` tensors along the first dimension. +// Encode audio data using the WAV file format. // -// The input tensors are all required to have size 1 in the first dimension. +// This operation will generate a string suitable to be saved out to create a .wav +// audio file. It will be encoded in the 16-bit PCM format. It takes in float +// values in the range -1.0f to 1.0f, and any outside that value will be clamped to +// that range. // -// For example: -// -// ``` -// # 'x' is [[1, 4]] -// # 'y' is [[2, 5]] -// # 'z' is [[3, 6]] -// parallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]] # Pack along first dim. -// ``` -// -// The difference between concat and parallel_concat is that concat requires all -// of the inputs be computed before the operation will begin but doesn't require -// that the input shapes be known during graph construction. Parallel concat -// will copy pieces of the input into the output as they become available, in -// some situations this can provide a performance benefit. +// `audio` is a 2-D float Tensor of shape `[length, channels]`. +// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100). // // Arguments: -// values: Tensors to be concatenated. All must have size 1 in the first dimension -// and same shape. -// shape: the final shape of the result; should be equal to the shapes of any input -// but with the number of input values in the first dimension. +// audio: 2-D with shape `[length, channels]`. +// sample_rate: Scalar containing the sample frequency. // -// Returns The concatenated tensor. -func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"shape": shape} - opspec := tf.OpSpec{ - Type: "ParallelConcat", - Input: []tf.Input{ - tf.OutputList(values), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// AudioSpectrogramAttr is an optional argument to AudioSpectrogram. -type AudioSpectrogramAttr func(optionalAttr) - -// AudioSpectrogramMagnitudeSquared sets the optional magnitude_squared attribute to value. -// -// value: Whether to return the squared magnitude or just the -// magnitude. Using squared magnitude can avoid extra calculations. -// If not specified, defaults to false -func AudioSpectrogramMagnitudeSquared(value bool) AudioSpectrogramAttr { - return func(m optionalAttr) { - m["magnitude_squared"] = value - } -} - -// Produces a visualization of audio data over time. -// -// Spectrograms are a standard way of representing audio information as a series of -// slices of frequency information, one slice for each window of time. By joining -// these together into a sequence, they form a distinctive fingerprint of the sound -// over time. 
-// -// This op expects to receive audio data as an input, stored as floats in the range -// -1 to 1, together with a window width in samples, and a stride specifying how -// far to move the window between slices. From this it generates a three -// dimensional output. The first dimension is for the channels in the input, so a -// stereo audio input would have two here for example. The second dimension is time, -// with successive frequency slices. The third dimension has an amplitude value for -// each frequency during that time slice. -// -// This means the layout when converted and saved as an image is rotated 90 degrees -// clockwise from a typical spectrogram. Time is descending down the Y axis, and -// the frequency decreases from left to right. -// -// Each value in the result represents the square root of the sum of the real and -// imaginary parts of an FFT on the current window of samples. In this way, the -// lowest dimension represents the power of each frequency in the current window, -// and adjacent windows are concatenated in the next dimension. -// -// To get a more intuitive and visual look at what this operation does, you can run -// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the -// resulting spectrogram as a PNG image. -// -// Arguments: -// input: Float representation of audio data. -// window_size: How wide the input window is in samples. For the highest efficiency -// this should be a power of two, but other values are accepted. -// stride: How widely apart the center of adjacent sample windows should be. -// -// Returns 3D representation of the audio frequencies as an image. -func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride int64, optional ...AudioSpectrogramAttr) (spectrogram tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"window_size": window_size, "stride": stride} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "AudioSpectrogram", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// UnbatchAttr is an optional argument to Unbatch. -type UnbatchAttr func(optionalAttr) - -// UnbatchContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func UnbatchContainer(value string) UnbatchAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// UnbatchSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func UnbatchSharedName(value string) UnbatchAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Reverses the operation of Batch for a single output Tensor. -// -// An instance of Unbatch either receives an empty batched_tensor, in which case it -// asynchronously waits until the values become available from a concurrently -// running instance of Unbatch with the same container and shared_name, or receives -// a non-empty batched_tensor in which case it finalizes all other concurrently -// running instances and outputs its own element from the batch. -// -// batched_tensor: The possibly transformed output of Batch. The size of the first -// dimension should remain unchanged by the transformations for the operation to -// work. -// batch_index: The matching batch_index obtained from Batch. -// id: The id scalar emitted by Batch. -// unbatched_tensor: The Tensor corresponding to this execution. 
-// timeout_micros: Maximum amount of time (in microseconds) to wait to receive the -// batched input tensor associated with a given invocation of the op. -// container: Container to control resource sharing. -// shared_name: Instances of Unbatch with the same container and shared_name are -// assumed to possibly belong to the same batch. If left empty, the op name will -// be used as the shared name. -func Unbatch(scope *Scope, batched_tensor tf.Output, batch_index tf.Output, id tf.Output, timeout_micros int64, optional ...UnbatchAttr) (unbatched_tensor tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"timeout_micros": timeout_micros} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Unbatch", - Input: []tf.Input{ - batched_tensor, batch_index, id, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// BatchAttr is an optional argument to Batch. -type BatchAttr func(optionalAttr) - -// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value. -// If not specified, defaults to 10 -func BatchMaxEnqueuedBatches(value int64) BatchAttr { - return func(m optionalAttr) { - m["max_enqueued_batches"] = value - } -} - -// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value. -// If not specified, defaults to <> -func BatchAllowedBatchSizes(value []int64) BatchAttr { - return func(m optionalAttr) { - m["allowed_batch_sizes"] = value - } -} - -// BatchContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func BatchContainer(value string) BatchAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// BatchSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func BatchSharedName(value string) BatchAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// BatchBatchingQueue sets the optional batching_queue attribute to value. -// If not specified, defaults to "" -func BatchBatchingQueue(value string) BatchAttr { - return func(m optionalAttr) { - m["batching_queue"] = value - } -} - -// Batches all input tensors nondeterministically. -// -// When many instances of this Op are being run concurrently with the same -// container/shared_name in the same device, some will output zero-shaped Tensors -// and others will output Tensors of size up to max_batch_size. -// -// All Tensors in in_tensors are batched together (so, for example, labels and -// features should be batched with a single instance of this operation. -// -// Each invocation of batch emits an `id` scalar which will be used to identify -// this particular invocation when doing unbatch or its gradient. -// -// Each op which emits a non-empty batch will also emit a non-empty batch_index -// Tensor, which, is a [K, 3] matrix where each row contains the invocation's id, -// start, and length of elements of each set of Tensors present in batched_tensors. -// -// Batched tensors are concatenated along the first dimension, and all tensors in -// in_tensors must have the first dimension of the same size. -// -// in_tensors: The tensors to be batched. -// num_batch_threads: Number of scheduling threads for processing batches of work. -// Determines the number of batches processed in parallel. -// max_batch_size: Batch sizes will never be bigger than this. -// batch_timeout_micros: Maximum number of microseconds to wait before outputting -// an incomplete batch. 
-// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does -// nothing. Otherwise, supplies a list of batch sizes, causing the op to pad -// batches up to one of those sizes. The entries must increase monotonically, and -// the final entry must equal max_batch_size. -// grad_timeout_micros: The timeout to use for the gradient. See Unbatch. -// batched_tensors: Either empty tensors or a batch of concatenated Tensors. -// batch_index: If out_tensors is non-empty, has information to invert it. -// container: Controls the scope of sharing of this batch. -// id: always contains a scalar with a unique ID for this invocation of Batch. -// shared_name: Concurrently running instances of batch in the same device with the -// same container and shared_name will batch their elements together. If left -// empty, the op name will be used as the shared name. -// T: the types of tensors to be batched. -func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Batch", - Input: []tf.Input{ - tf.OutputList(in_tensors), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil { - scope.UpdateErr("Batch", err) - return - } - batch_index = op.Output(idx) - id = op.Output(idx) - return batched_tensors, batch_index, id -} - -// Elementwise computes the bitwise left-shift of `x` and `y`. -// -// If `y` is negative, or greater than or equal to the width of `x` in bits the -// result is implementation defined. -func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { +// Returns 0-D. WAV-encoded file contents. +func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "LeftShift", + Type: "EncodeWav", Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Elementwise computes the bitwise XOR of `x` and `y`. -// -// The result will have those bits set, that are different in `x` and `y`. The -// computation is performed on the underlying representations of `x` and `y`. -func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "BitwiseXor", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Elementwise computes the bitwise OR of `x` and `y`. -// -// The result will have those bits set, that are set in `x`, `y` or both. The -// computation is performed on the underlying representations of `x` and `y`. 
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "BitwiseOr", - Input: []tf.Input{ - x, y, + audio, sample_rate, }, } op := scope.AddOperation(opspec) @@ -3799,6 +3774,103 @@ func DecodeWav(scope *Scope, contents tf.Output, optional ...DecodeWavAttr) (aud return op.Output(0), op.Output(1) } +// UnbatchAttr is an optional argument to Unbatch. +type UnbatchAttr func(optionalAttr) + +// UnbatchContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func UnbatchContainer(value string) UnbatchAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// UnbatchSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func UnbatchSharedName(value string) UnbatchAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Reverses the operation of Batch for a single output Tensor. +// +// An instance of Unbatch either receives an empty batched_tensor, in which case it +// asynchronously waits until the values become available from a concurrently +// running instance of Unbatch with the same container and shared_name, or receives +// a non-empty batched_tensor in which case it finalizes all other concurrently +// running instances and outputs its own element from the batch. +// +// batched_tensor: The possibly transformed output of Batch. The size of the first +// dimension should remain unchanged by the transformations for the operation to +// work. +// batch_index: The matching batch_index obtained from Batch. +// id: The id scalar emitted by Batch. +// unbatched_tensor: The Tensor corresponding to this execution. +// timeout_micros: Maximum amount of time (in microseconds) to wait to receive the +// batched input tensor associated with a given invocation of the op. +// container: Container to control resource sharing. +// shared_name: Instances of Unbatch with the same container and shared_name are +// assumed to possibly belong to the same batch. If left empty, the op name will +// be used as the shared name. +func Unbatch(scope *Scope, batched_tensor tf.Output, batch_index tf.Output, id tf.Output, timeout_micros int64, optional ...UnbatchAttr) (unbatched_tensor tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"timeout_micros": timeout_micros} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Unbatch", + Input: []tf.Input{ + batched_tensor, batch_index, id, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Elementwise computes the bitwise right-shift of `x` and `y`. +// +// Performs a logical shift for unsigned integer types, and an arithmetic shift +// for signed integer types. +// +// If `y` is negative, or greater than or equal to than the width of `x` in bits +// the result is implementation defined. +func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "RightShift", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Elementwise computes the bitwise left-shift of `x` and `y`. +// +// If `y` is negative, or greater than or equal to the width of `x` in bits the +// result is implementation defined. 
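+//
+// For example (a hedged sketch; assumes int32 constants built with `Const`
+// under a scope `s`):
+//
+//   x := op.Const(s.SubScope("x"), []int32{1, 2, 4})
+//   y := op.Const(s.SubScope("y"), []int32{1, 2, 3})
+//   z := op.LeftShift(s, x, y)
+//   // z evaluates to [2, 8, 32].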
+func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "LeftShift", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Elementwise computes the bitwise AND of `x` and `y`. // // The result will have those bits set, that are set in both `x` and `y`. The @@ -3817,125 +3889,34 @@ func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { return op.Output(0) } -// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount). -// -// For each entry in `x`, calculates the number of `1` (on) bits in the binary -// representation of that entry. -// -// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into -// `int32` or `int64` and perform the bitcount on the result, than to feed in -// 8- or 16-bit inputs and then aggregate the resulting counts. -func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "PopulationCount", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} +// BoostedTreesQuantileStreamResourceFlushAttr is an optional argument to BoostedTreesQuantileStreamResourceFlush. +type BoostedTreesQuantileStreamResourceFlushAttr func(optionalAttr) -// Flips all bits elementwise. +// BoostedTreesQuantileStreamResourceFlushGenerateQuantiles sets the optional generate_quantiles attribute to value. // -// The result will have exactly those bits set, that are not set in `x`. The -// computation is performed on the underlying representation of x. -func Invert(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Invert", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Generate the bucket boundaries for each feature based on accumulated summaries. -// -// An op that returns a list of float tensors for a quantile stream resource. Each -// tensor is Rank 1 containing bucket boundaries for a single feature. -// -// Arguments: -// quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource. -// num_features: inferred int; number of features to get bucket boundaries for. -// -// Returns float; List of Rank 1 Tensors each containing the bucket boundaries for a feature. -func BoostedTreesQuantileStreamResourceGetBucketBoundaries(scope *Scope, quantile_stream_resource_handle tf.Output, num_features int64) (bucket_boundaries []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_features": num_features} - opspec := tf.OpSpec{ - Type: "BoostedTreesQuantileStreamResourceGetBucketBoundaries", - Input: []tf.Input{ - quantile_stream_resource_handle, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if bucket_boundaries, idx, err = makeOutputList(op, idx, "bucket_boundaries"); err != nil { - scope.UpdateErr("BoostedTreesQuantileStreamResourceGetBucketBoundaries", err) - return - } - return bucket_boundaries -} - -// Deserialize bucket boundaries and ready flag into current QuantileAccumulator. -// -// An op that deserializes bucket boundaries and are boundaries ready flag into current QuantileAccumulator. -// -// Arguments: -// quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource. 
-// bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a feature. -// -// Returns the created operation. -func BoostedTreesQuantileStreamResourceDeserialize(scope *Scope, quantile_stream_resource_handle tf.Output, bucket_boundaries []tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "BoostedTreesQuantileStreamResourceDeserialize", - Input: []tf.Input{ - quantile_stream_resource_handle, tf.OutputList(bucket_boundaries), - }, - } - return scope.AddOperation(opspec) -} - -// BoostedTreesCreateQuantileStreamResourceAttr is an optional argument to BoostedTreesCreateQuantileStreamResource. -type BoostedTreesCreateQuantileStreamResourceAttr func(optionalAttr) - -// BoostedTreesCreateQuantileStreamResourceMaxElements sets the optional max_elements attribute to value. -// -// value: int; The maximum number of data points that can be fed to the stream. -// If not specified, defaults to 1099511627776 -func BoostedTreesCreateQuantileStreamResourceMaxElements(value int64) BoostedTreesCreateQuantileStreamResourceAttr { +// value: bool; If True, the output will be the num_quantiles for each stream where the ith +// entry is the ith quantile of the input with an approximation error of epsilon. +// Duplicate values may be present. +// If False, the output will be the points in the histogram that we got which roughly +// translates to 1/epsilon boundaries and without any duplicates. +// Default to False. +// If not specified, defaults to false +func BoostedTreesQuantileStreamResourceFlushGenerateQuantiles(value bool) BoostedTreesQuantileStreamResourceFlushAttr { return func(m optionalAttr) { - m["max_elements"] = value + m["generate_quantiles"] = value } } -// Create the Resource for Quantile Streams. +// Flush the summaries for a quantile stream resource. +// +// An op that flushes the summaries for a quantile stream resource. // // Arguments: -// quantile_stream_resource_handle: resource; Handle to quantile stream resource. -// epsilon: float; The required approximation error of the stream resource. -// num_streams: int; The number of streams managed by the resource that shares the same epsilon. +// quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource. +// num_buckets: int; approximate number of buckets unless using generate_quantiles. // // Returns the created operation. -func BoostedTreesCreateQuantileStreamResource(scope *Scope, quantile_stream_resource_handle tf.Output, epsilon tf.Output, num_streams tf.Output, optional ...BoostedTreesCreateQuantileStreamResourceAttr) (o *tf.Operation) { +func BoostedTreesQuantileStreamResourceFlush(scope *Scope, quantile_stream_resource_handle tf.Output, num_buckets tf.Output, optional ...BoostedTreesQuantileStreamResourceFlushAttr) (o *tf.Operation) { if scope.Err() != nil { return } @@ -3944,15 +3925,39 @@ func BoostedTreesCreateQuantileStreamResource(scope *Scope, quantile_stream_reso a(attrs) } opspec := tf.OpSpec{ - Type: "BoostedTreesCreateQuantileStreamResource", + Type: "BoostedTreesQuantileStreamResourceFlush", Input: []tf.Input{ - quantile_stream_resource_handle, epsilon, num_streams, + quantile_stream_resource_handle, num_buckets, }, Attrs: attrs, } return scope.AddOperation(opspec) } +// Add the quantile summaries to each quantile stream resource. +// +// An op that adds a list of quantile summaries to a quantile stream resource. 
Each +// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank) +// for a single feature. +// +// Arguments: +// quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource. +// summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature. +// +// Returns the created operation. +func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BoostedTreesQuantileStreamResourceAddSummaries", + Input: []tf.Input{ + quantile_stream_resource_handle, tf.OutputList(summaries), + }, + } + return scope.AddOperation(opspec) +} + // Checks whether a quantile stream has been initialized. // // An Op that checks if quantile stream resource is initialized. @@ -4036,101 +4041,6 @@ func BoostedTreesCenterBias(scope *Scope, tree_ensemble_handle tf.Output, mean_g return op.Output(0) } -// Updates the tree ensemble by either adding a layer to the last tree being grown -// -// or by starting a new tree. -// -// Arguments: -// tree_ensemble_handle: Handle to the ensemble variable. -// feature_ids: Rank 1 tensor with ids for each feature. This is the real id of -// the feature that will be used in the split. -// node_ids: List of rank 1 tensors representing the nodes for which this feature -// has a split. -// gains: List of rank 1 tensors representing the gains for each of the feature's -// split. -// thresholds: List of rank 1 tensors representing the thesholds for each of the -// feature's split. -// left_node_contribs: List of rank 2 tensors with left leaf contribs for each of -// the feature's splits. Will be added to the previous node values to constitute -// the values of the left nodes. -// right_node_contribs: List of rank 2 tensors with right leaf contribs for each -// of the feature's splits. Will be added to the previous node values to constitute -// the values of the right nodes. -// max_depth: Max depth of the tree to build. -// learning_rate: shrinkage const for each new tree. -// pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning. -// -// Returns the created operation. -func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"pruning_mode": pruning_mode} - opspec := tf.OpSpec{ - Type: "BoostedTreesUpdateEnsemble", - Input: []tf.Input{ - tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Runs multiple additive regression ensemble predictors on input instances and -// -// computes the update to cached logits. It is designed to be used during training. -// It traverses the trees starting from cached tree id and cached node id and -// calculates the updates to be pushed to the cache. -// -// Arguments: -// -// cached_tree_ids: Rank 1 Tensor containing cached tree ids which is the starting -// tree of prediction. 
-// cached_node_ids: Rank 1 Tensor containing cached node id which is the starting -// node of prediction. -// bucketized_features: A list of rank 1 Tensors containing bucket id for each -// feature. -// logits_dimension: scalar, dimension of the logits, to be used for partial logits -// shape. -// -// Returns Rank 2 Tensor containing logits update (with respect to cached -// values stored) for each example.Rank 1 Tensor containing new tree ids for each example.Rank 1 Tensor containing new node ids in the new tree_ids. -func BoostedTreesTrainingPredict(scope *Scope, tree_ensemble_handle tf.Output, cached_tree_ids tf.Output, cached_node_ids tf.Output, bucketized_features []tf.Output, logits_dimension int64) (partial_logits tf.Output, tree_ids tf.Output, node_ids tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"logits_dimension": logits_dimension} - opspec := tf.OpSpec{ - Type: "BoostedTreesTrainingPredict", - Input: []tf.Input{ - tree_ensemble_handle, cached_tree_ids, cached_node_ids, tf.OutputList(bucketized_features), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Serializes the tree ensemble to a proto. -// -// Arguments: -// tree_ensemble_handle: Handle to the tree ensemble. -// -// Returns Stamp token of the tree ensemble resource.Serialized proto of the ensemble. -func BoostedTreesSerializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, tree_ensemble_serialized tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "BoostedTreesSerializeEnsemble", - Input: []tf.Input{ - tree_ensemble_handle, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - // Runs multiple additive regression ensemble predictors on input instances and // // computes the logits. It is designed to be used during prediction. @@ -4219,6 +4129,48 @@ func BoostedTreesMakeStatsSummary(scope *Scope, node_ids tf.Output, gradients tf return op.Output(0) } +// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics. +// +// Arguments: +// tree_ensemble_handle: Handle to the tree ensemble. +// +// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest +// layer. +func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BoostedTreesGetEnsembleStates", + Input: []tf.Input{ + tree_ensemble_handle, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4) +} + +// Creates a tree ensemble model and returns a handle to it. +// +// Arguments: +// tree_ensemble_handle: Handle to the tree ensemble resource to be created. +// stamp_token: Token to use as the initial value of the resource stamp. +// tree_ensemble_serialized: Serialized proto of the tree ensemble. +// +// Returns the created operation. 
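+//
+// A minimal sketch (illustrative; `handle` is assumed to come from an ensemble
+// resource handle op such as BoostedTreesEnsembleResourceHandleOp, not shown
+// here, and an empty serialized proto stands in for a fresh ensemble):
+//
+//   stamp := op.Const(s.SubScope("stamp"), int64(0))
+//   proto := op.Const(s.SubScope("proto"), "")
+//   create := op.BoostedTreesCreateEnsemble(s, handle, stamp, proto)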
+func BoostedTreesCreateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BoostedTreesCreateEnsemble", + Input: []tf.Input{ + tree_ensemble_handle, stamp_token, tree_ensemble_serialized, + }, + } + return scope.AddOperation(opspec) +} + // BoostedTreesCalculateBestFeatureSplitAttr is an optional argument to BoostedTreesCalculateBestFeatureSplit. type BoostedTreesCalculateBestFeatureSplitAttr func(optionalAttr) @@ -4272,6 +4224,68 @@ func BoostedTreesCalculateBestFeatureSplit(scope *Scope, node_id_range tf.Output return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6) } +// Calculates gains for each feature and returns the best possible split information for the feature. +// +// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature. +// +// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split. +// +// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features). +// +// The length of output lists are all of the same length, `num_features`. +// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature. +// +// Arguments: +// node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive). +// stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used. +// l1: l1 regularization factor on leaf weights, per instance based. +// l2: l2 regularization factor on leaf weights, per instance based. +// tree_complexity: adjustment to the gain, per leaf based. +// min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting. +// max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors. +// +// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. 
See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node. +func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"max_splits": max_splits} + opspec := tf.OpSpec{ + Type: "BoostedTreesCalculateBestGainsPerFeature", + Input: []tf.Input{ + node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil { + scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err) + return + } + if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil { + scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err) + return + } + if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil { + scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err) + return + } + if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil { + scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err) + return + } + if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil { + scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err) + return + } + return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list +} + // Checks whether a tree ensemble has been initialized. // // Arguments: @@ -4393,24 +4407,44 @@ func TensorForestTreeDeserialize(scope *Scope, tree_handle tf.Output, tree_confi return scope.AddOperation(opspec) } -// Creates a tree resource and returns a handle to it. +// Serializes the tree handle to a proto // // Arguments: -// tree_handle: Handle to the tree resource to be created. -// tree_config: Serialized proto string of the boosted_trees.Tree. +// tree_handle: Handle to the tree resource to be serialized. // -// Returns the created operation. -func TensorForestCreateTreeVariable(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) { +// Returns Serialied proto string of the tree resource. 
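+//
+// A minimal sketch (illustrative; the handle comes from
+// TensorForestTreeResourceHandleOp, defined later in this file, and the tree
+// resource is assumed to have been created and initialized beforehand):
+//
+//   handle := op.TensorForestTreeResourceHandleOp(s)
+//   config := op.TensorForestTreeSerialize(s, handle)
+//   // config is a scalar string tensor holding the serialized tree proto.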
+func TensorForestTreeSerialize(scope *Scope, tree_handle tf.Output) (tree_config tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "TensorForestCreateTreeVariable", + Type: "TensorForestTreeSerialize", Input: []tf.Input{ - tree_handle, tree_config, + tree_handle, }, } - return scope.AddOperation(opspec) + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Checks whether a tree has been initialized. +// +// Arguments: +// tree_handle: Handle to the tree. +// +// Returns Whether the tree is initialized. +func TensorForestTreeIsInitializedOp(scope *Scope, tree_handle tf.Output) (is_initialized tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorForestTreeIsInitializedOp", + Input: []tf.Input{ + tree_handle, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) } // TensorForestTreeResourceHandleOpAttr is an optional argument to TensorForestTreeResourceHandleOp. @@ -4450,6 +4484,184 @@ func TensorForestTreeResourceHandleOp(scope *Scope, optional ...TensorForestTree return op.Output(0) } +// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits. +type ComputeAccidentalHitsAttr func(optionalAttr) + +// ComputeAccidentalHitsSeed sets the optional seed attribute to value. +// +// value: If either seed or seed2 are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value. +// +// value: An second seed to avoid seed collision. +// If not specified, defaults to 0 +func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Computes the ids of the positions in sampled_candidates that match true_labels. +// +// When doing log-odds NCE, the result of this op should be passed through a +// SparseToDense op, then added to the logits of the sampled candidates. This has +// the effect of 'removing' the sampled labels that match the true labels by +// making the classifier sure that they are sampled labels. +// +// Arguments: +// true_classes: The true_classes output of UnpackSparseLabels. +// sampled_candidates: The sampled_candidates output of CandidateSampler. +// num_true: Number of true labels per context. +// +// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label +// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element +// is -FLOAT_MAX. +func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_true": num_true} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ComputeAccidentalHits", + Input: []tf.Input{ + true_classes, sampled_candidates, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// PreventGradientAttr is an optional argument to PreventGradient. 
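+//
+// A typical use attaches a message that is reported if a gradient is ever
+// requested for the guarded tensor (a hedged sketch; `x` is any tf.Output
+// built under a scope `s`):
+//
+//   y := op.PreventGradient(s, x, op.PreventGradientMessage("no gradient defined for x"))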
+type PreventGradientAttr func(optionalAttr) + +// PreventGradientMessage sets the optional message attribute to value. +// +// value: Will be printed in the error when anyone tries to differentiate +// this operation. +// If not specified, defaults to "" +func PreventGradientMessage(value string) PreventGradientAttr { + return func(m optionalAttr) { + m["message"] = value + } +} + +// An identity op that triggers an error if a gradient is requested. +// +// When executed in a graph, this op outputs its input tensor as-is. +// +// When building ops to compute gradients, the TensorFlow gradient system +// will return an error when trying to lookup the gradient of this op, +// because no gradient must ever be registered for this function. This +// op exists to prevent subtle bugs from silently returning unimplemented +// gradients in some corner cases. +// +// Arguments: +// input: any tensor. +// +// Returns the same input tensor. +func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "PreventGradient", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler. +type AllCandidateSamplerAttr func(optionalAttr) + +// AllCandidateSamplerSeed sets the optional seed attribute to value. +// +// value: If either seed or seed2 are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value. +// +// value: An second seed to avoid seed collision. +// If not specified, defaults to 0 +func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Generates labels for candidate sampling with a learned unigram distribution. +// +// See explanations of candidate sampling and the data formats at +// go/candidate-sampling. +// +// For each batch, this op picks a single set of sampled candidate labels. +// +// The advantages of sampling candidates per-batch are simplicity and the +// possibility of efficient dense matrix multiplication. The disadvantage is that +// the sampled candidates must be chosen independently of the context and of the +// true labels. +// +// Arguments: +// true_classes: A batch_size * num_true matrix, in which each row contains the +// IDs of the num_true target_classes in the corresponding original label. +// num_true: Number of true labels per context. +// num_sampled: Number of candidates to produce. +// unique: If unique is true, we sample with rejection, so that all sampled +// candidates in a batch are unique. This requires some approximation to +// estimate the post-rejection sampling probabilities. +// +// Returns A vector of length num_sampled, in which each element is +// the ID of a sampled candidate.A batch_size * num_true matrix, representing +// the number of times each candidate is expected to occur in a batch +// of sampled candidates. 
If unique=true, then this is a probability.A vector of length num_sampled, for each sampled +// candidate representing the number of times the candidate is expected +// to occur in a batch of sampled candidates. If unique=true, then this is a +// probability. +func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "AllCandidateSampler", + Input: []tf.Input{ + true_classes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + // LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler. type LearnedUnigramCandidateSamplerAttr func(optionalAttr) @@ -4523,96 +4735,6 @@ func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_tr return op.Output(0), op.Output(1), op.Output(2) } -// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix. -type LoadAndRemapMatrixAttr func(optionalAttr) - -// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value. -// -// value: The maximum number of rows to load from the checkpoint at -// once. If less than or equal to 0, the entire matrix will be loaded into -// memory. Setting this arg trades increased disk reads for lower memory usage. -// If not specified, defaults to -1 -func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr { - return func(m optionalAttr) { - m["max_rows_in_memory"] = value - } -} - -// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint -// -// at `ckpt_path` and potentially reorders its rows and columns using the -// specified remappings. -// -// Most users should use one of the wrapper initializers (such as -// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this -// function directly. -// -// The remappings are 1-D tensors with the following properties: -// -// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output -// matrix will be initialized from the row corresponding to index -// `row_remapping[i]` in the old `Tensor` from the checkpoint. -// * `col_remapping` must have either 0 entries (indicating that no column -// reordering is needed) or `num_cols` entries. If specified, column `j` of the -// output matrix will be initialized from the column corresponding to index -// `col_remapping[j]` in the old `Tensor` from the checkpoint. -// * A value of -1 in either of the remappings signifies a "missing" entry. In that -// case, values from the `initializing_values` tensor will be used to fill that -// missing row or column. If `row_remapping` has `r` missing entries and -// `col_remapping` has `c` missing entries, then the following condition must be -// true: -// -// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)` -// -// The remapping tensors can be generated using the GenerateVocabRemapping op. 
-// -// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1], -// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing -// the value from row i, column j of the old tensor in the checkpoint, the output -// matrix will look like the following: -// -// [[w(1, 0), w(1, 2), 0.5], -// [w(0, 0), w(0, 2), -0.5], -// [0.25, -0.25, 42]] -// -// Arguments: -// ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from -// which the old matrix `Tensor` will be loaded. -// old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. -// row_remapping: An int `Tensor` of row remappings (generally created by -// `generate_vocab_remapping`). Even if no row remapping is needed, this must -// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted -// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`). -// col_remapping: An int `Tensor` of column remappings (generally created by -// `generate_vocab_remapping`). May be a size-0 `Tensor` if only row remapping -// is to be done (e.g. column ordering is the same). -// initializing_values: A float `Tensor` containing values to fill in for cells -// in the output matrix that are not loaded from the checkpoint. Length must be -// exactly the same as the number of missing / new cells. -// num_rows: Number of rows (length of the 1st dimension) in the output matrix. -// num_cols: Number of columns (length of the 2nd dimension) in the output matrix. -// -// Returns Output matrix containing existing values loaded from the -// checkpoint, and with any missing values filled in from initializing_values. -func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LoadAndRemapMatrix", - Input: []tf.Input{ - ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping. type GenerateVocabRemappingAttr func(optionalAttr) @@ -4688,33 +4810,6 @@ func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_fi return op.Output(0), op.Output(1) } -// Returns the index of a data point that should be added to the seed set. -// -// Entries in distances are assumed to be squared distances of candidate points to -// the already sampled centers in the seed set. The op constructs one Markov chain -// of the k-MC^2 algorithm and returns the index of one candidate point to be added -// as an additional cluster center. -// -// Arguments: -// distances: Vector with squared distances to the closest previously sampled cluster center -// for each candidate point. -// seed: Scalar. Seed for initializing the random number generator. -// -// Returns Scalar with the index of the sampled point. 
-func KMC2ChainInitialization(scope *Scope, distances tf.Output, seed tf.Output) (index tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "KMC2ChainInitialization", - Input: []tf.Input{ - distances, seed, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Selects num_to_sample rows of input using the KMeans++ criterion. // // Rows of points are assumed to be input points. One row is selected at random. @@ -4746,38 +4841,6 @@ func KmeansPlusPlusInitialization(scope *Scope, points tf.Output, num_to_sample return op.Output(0) } -// Receives a tensor value broadcast from another device. -func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape} - opspec := tf.OpSpec{ - Type: "CollectiveBcastRecv", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Broadcasts a tensor value to one or more other devices. -func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape} - opspec := tf.OpSpec{ - Type: "CollectiveBcastSend", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // CollectiveReduceAttr is an optional argument to CollectiveReduce. type CollectiveReduceAttr func(optionalAttr) @@ -4809,51 +4872,41 @@ func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key return op.Output(0) } -// AbortAttr is an optional argument to Abort. -type AbortAttr func(optionalAttr) - -// AbortErrorMsg sets the optional error_msg attribute to value. +// Does nothing. Serves as a control trigger for scheduling. // -// value: A string which is the message associated with the exception. -// If not specified, defaults to "" -func AbortErrorMsg(value string) AbortAttr { - return func(m optionalAttr) { - m["error_msg"] = value - } -} - -// AbortExitWithoutError sets the optional exit_without_error attribute to value. -// If not specified, defaults to false -func AbortExitWithoutError(value bool) AbortAttr { - return func(m optionalAttr) { - m["exit_without_error"] = value - } -} - -// Raise a exception to abort the process when called. -// -// If exit_without_error is true, the process will exit normally, -// otherwise it will exit with a SIGABORT signal. -// -// Returns nothing but an exception. +// Only useful as a placeholder for control edges. // // Returns the created operation. -func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) { +func ControlTrigger(scope *Scope) (o *tf.Operation) { if scope.Err() != nil { return } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } opspec := tf.OpSpec{ - Type: "Abort", - - Attrs: attrs, + Type: "ControlTrigger", } return scope.AddOperation(opspec) } +// Makes its input available to the next iteration. +// +// Arguments: +// data: The tensor to be made available to the next iteration. +// +// Returns The same tensor as `data`. 
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "NextIteration", + Input: []tf.Input{ + data, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Exits the current frame to its parent frame. // // Exit makes its input `data` available to the parent frame. @@ -4876,6 +4929,61 @@ func Exit(scope *Scope, data tf.Output) (output tf.Output) { return op.Output(0) } +// EnterAttr is an optional argument to Enter. +type EnterAttr func(optionalAttr) + +// EnterIsConstant sets the optional is_constant attribute to value. +// +// value: If true, the output is constant within the child frame. +// If not specified, defaults to false +func EnterIsConstant(value bool) EnterAttr { + return func(m optionalAttr) { + m["is_constant"] = value + } +} + +// EnterParallelIterations sets the optional parallel_iterations attribute to value. +// +// value: The number of iterations allowed to run in parallel. +// If not specified, defaults to 10 +func EnterParallelIterations(value int64) EnterAttr { + return func(m optionalAttr) { + m["parallel_iterations"] = value + } +} + +// Creates or finds a child frame, and makes `data` available to the child frame. +// +// This op is used together with `Exit` to create loops in the graph. +// The unique `frame_name` is used by the `Executor` to identify frames. If +// `is_constant` is true, `output` is a constant in the child frame; otherwise +// it may be changed in the child frame. At most `parallel_iterations` iterations +// are run in parallel in the child frame. +// +// Arguments: +// data: The tensor to be made available to the child frame. +// frame_name: The name of the child frame. +// +// Returns The same tensor as `data`. +func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"frame_name": frame_name} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Enter", + Input: []tf.Input{ + data, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Forwards the value of an available tensor from `inputs` to `output`. // // `Merge` waits for at least one of the tensors in `inputs` to become available. @@ -4902,79 +5010,6 @@ func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.O return op.Output(0), op.Output(1) } -// UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler. -type UniformCandidateSamplerAttr func(optionalAttr) - -// UniformCandidateSamplerSeed sets the optional seed attribute to value. -// -// value: If either seed or seed2 are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func UniformCandidateSamplerSeed(value int64) UniformCandidateSamplerAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// UniformCandidateSamplerSeed2 sets the optional seed2 attribute to value. -// -// value: An second seed to avoid seed collision. -// If not specified, defaults to 0 -func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Generates labels for candidate sampling with a uniform distribution. -// -// See explanations of candidate sampling and the data formats at -// go/candidate-sampling. 
-// -// For each batch, this op picks a single set of sampled candidate labels. -// -// The advantages of sampling candidates per-batch are simplicity and the -// possibility of efficient dense matrix multiplication. The disadvantage is that -// the sampled candidates must be chosen independently of the context and of the -// true labels. -// -// Arguments: -// true_classes: A batch_size * num_true matrix, in which each row contains the -// IDs of the num_true target_classes in the corresponding original label. -// num_true: Number of true labels per context. -// num_sampled: Number of candidates to randomly sample. -// unique: If unique is true, we sample with rejection, so that all sampled -// candidates in a batch are unique. This requires some approximation to -// estimate the post-rejection sampling probabilities. -// range_max: The sampler will sample integers from the interval [0, range_max). -// -// Returns A vector of length num_sampled, in which each element is -// the ID of a sampled candidate.A batch_size * num_true matrix, representing -// the number of times each candidate is expected to occur in a batch -// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled -// candidate representing the number of times the candidate is expected -// to occur in a batch of sampled candidates. If unique=true, then this is a -// probability. -func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...UniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "UniformCandidateSampler", - Input: []tf.Input{ - true_classes, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - // Forwards `data` to the output port determined by `pred`. // // If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise, @@ -5001,6 +5036,131 @@ func Switch(scope *Scope, data tf.Output, pred tf.Output) (output_false tf.Outpu return op.Output(0), op.Output(1) } +// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder. +type CTCBeamSearchDecoderAttr func(optionalAttr) + +// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value. +// +// value: If true, merge repeated classes in output. +// If not specified, defaults to true +func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr { + return func(m optionalAttr) { + m["merge_repeated"] = value + } +} + +// Performs beam search decoding on the logits given in input. +// +// A note about the attribute merge_repeated: For the beam search decoder, +// this means that if consecutive entries in a beam are the same, only +// the first of these is emitted. That is, when the top path is "A B B B B", +// "A B" is returned if merge_repeated = True but "A B B B B" is +// returned if merge_repeated = False. +// +// Arguments: +// inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits. +// sequence_length: A vector containing sequence lengths, size `(batch)`. +// beam_width: A scalar >= 0 (beam search beam width). 
+// top_paths: A scalar >= 0, <= beam_width (controls output size). +// +// Returns A list (length: top_paths) of indices matrices. Matrix j, +// size `(total_decoded_outputs[j] x 2)`, has indices of a +// `SparseTensor<int64, 2>`. The rows store: [batch, time].A list (length: top_paths) of values vectors. Vector j, +// size `(length total_decoded_outputs[j])`, has the values of a +// `SparseTensor<int64, 2>`. The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector. Vector j, +// size `(2)`, stores the shape of the decoded `SparseTensor[j]`. +// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`. The +// sequence log-probabilities. +func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "CTCBeamSearchDecoder", + Input: []tf.Input{ + inputs, sequence_length, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil { + scope.UpdateErr("CTCBeamSearchDecoder", err) + return + } + if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil { + scope.UpdateErr("CTCBeamSearchDecoder", err) + return + } + if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil { + scope.UpdateErr("CTCBeamSearchDecoder", err) + return + } + log_probability = op.Output(idx) + return decoded_indices, decoded_values, decoded_shape, log_probability +} + +// CTCGreedyDecoderAttr is an optional argument to CTCGreedyDecoder. +type CTCGreedyDecoderAttr func(optionalAttr) + +// CTCGreedyDecoderMergeRepeated sets the optional merge_repeated attribute to value. +// +// value: If True, merge repeated classes in output. +// If not specified, defaults to false +func CTCGreedyDecoderMergeRepeated(value bool) CTCGreedyDecoderAttr { + return func(m optionalAttr) { + m["merge_repeated"] = value + } +} + +// Performs greedy decoding on the logits given in inputs. +// +// A note about the attribute merge_repeated: if enabled, when +// consecutive logits' maximum indices are the same, only the first of +// these is emitted. Labeling the blank '*', the sequence "A B B * B B" +// becomes "A B B" if merge_repeated = True and "A B B B B" if +// merge_repeated = False. +// +// Regardless of the value of merge_repeated, if the maximum index of a given +// time and batch corresponds to the blank, index `(num_classes - 1)`, no new +// element is emitted. +// +// Arguments: +// inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits. +// sequence_length: A vector containing sequence lengths, size `(batch_size)`. +// +// Returns Indices matrix, size `(total_decoded_outputs x 2)`, +// of a `SparseTensor<int64, 2>`. The rows store: [batch, time].Values vector, size: `(total_decoded_outputs)`, +// of a `SparseTensor<int64, 2>`. The vector stores the decoded classes.Shape vector, size `(2)`, of the decoded SparseTensor. 
+// Values are: `[batch_size, max_decoded_length]`.Matrix, size `(batch_size x 1)`, containing sequence +// log-probabilities. +func CTCGreedyDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, optional ...CTCGreedyDecoderAttr) (decoded_indices tf.Output, decoded_values tf.Output, decoded_shape tf.Output, log_probability tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "CTCGreedyDecoder", + Input: []tf.Input{ + inputs, sequence_length, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3) +} + // CTCLossAttr is an optional argument to CTCLoss. type CTCLossAttr func(optionalAttr) @@ -5073,107 +5233,6 @@ func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_va return op.Output(0), op.Output(1) } -// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams. -type CudnnRNNCanonicalToParamsAttr func(optionalAttr) - -// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value. -// If not specified, defaults to "lstm" -func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr { - return func(m optionalAttr) { - m["rnn_mode"] = value - } -} - -// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value. -// If not specified, defaults to "linear_input" -func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr { - return func(m optionalAttr) { - m["input_mode"] = value - } -} - -// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value. -// If not specified, defaults to "unidirectional" -func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr { - return func(m optionalAttr) { - m["direction"] = value - } -} - -// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value. -// If not specified, defaults to 0 -func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr { - return func(m optionalAttr) { - m["dropout"] = value - } -} - -// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value. -// If not specified, defaults to 0 -func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value. -// If not specified, defaults to 0 -func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Converts CudnnRNN params from canonical form to usable form. -// -// Writes a set of weights into the opaque params buffer so they can be used in -// upcoming training or inferences. -// -// Note that the params buffer may not be compatible across different GPUs. So any -// save and restoration should be converted to and from the canonical weights and -// biases. -// -// num_layers: Specifies the number of layers in the RNN model. -// num_units: Specifies the size of the hidden state. -// input_size: Specifies the size of the input state. -// weights: the canonical form of weights that can be used for saving -// and restoration. They are more likely to be compatible across different -// generations. -// biases: the canonical form of biases that can be used for saving -// and restoration. 
They are more likely to be compatible across different -// generations. -// num_params: number of parameter sets for all layers. -// Each layer may contain multiple parameter sets, with each set consisting of -// a weight matrix and a bias vector. -// rnn_mode: Indicates the type of the RNN model. -// input_mode: Indicate whether there is a linear projection between the input and -// The actual computation before the first layer. 'skip_input' is only allowed -// when input_size == num_units; 'auto_select' implies 'skip_input' when -// input_size == num_units; otherwise, it implies 'linear_input'. -// direction: Indicates whether a bidirectional model will be used. -// dir = (direction == bidirectional) ? 2 : 1 -// dropout: dropout probability. When set to 0., dropout is disabled. -// seed: the 1st part of a seed to initialize dropout. -// seed2: the 2nd part of a seed to initialize dropout. -func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "CudnnRNNCanonicalToParams", - Input: []tf.Input{ - num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // CudnnRNNParamsToCanonicalAttr is an optional argument to CudnnRNNParamsToCanonical. type CudnnRNNParamsToCanonicalAttr func(optionalAttr) @@ -5288,113 +5347,99 @@ func CudnnRNNParamsToCanonical(scope *Scope, num_layers tf.Output, num_units tf. return weights, biases } -// CudnnRNNV3Attr is an optional argument to CudnnRNNV3. -type CudnnRNNV3Attr func(optionalAttr) +// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop. +type CudnnRNNBackpropAttr func(optionalAttr) -// CudnnRNNV3RnnMode sets the optional rnn_mode attribute to value. +// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value. // If not specified, defaults to "lstm" -func CudnnRNNV3RnnMode(value string) CudnnRNNV3Attr { +func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr { return func(m optionalAttr) { m["rnn_mode"] = value } } -// CudnnRNNV3InputMode sets the optional input_mode attribute to value. +// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value. // If not specified, defaults to "linear_input" -func CudnnRNNV3InputMode(value string) CudnnRNNV3Attr { +func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr { return func(m optionalAttr) { m["input_mode"] = value } } -// CudnnRNNV3Direction sets the optional direction attribute to value. +// CudnnRNNBackpropDirection sets the optional direction attribute to value. // If not specified, defaults to "unidirectional" -func CudnnRNNV3Direction(value string) CudnnRNNV3Attr { +func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr { return func(m optionalAttr) { m["direction"] = value } } -// CudnnRNNV3Dropout sets the optional dropout attribute to value. +// CudnnRNNBackpropDropout sets the optional dropout attribute to value. // If not specified, defaults to 0 -func CudnnRNNV3Dropout(value float32) CudnnRNNV3Attr { +func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr { return func(m optionalAttr) { m["dropout"] = value } } -// CudnnRNNV3Seed sets the optional seed attribute to value. 
+// CudnnRNNBackpropSeed sets the optional seed attribute to value. // If not specified, defaults to 0 -func CudnnRNNV3Seed(value int64) CudnnRNNV3Attr { +func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr { return func(m optionalAttr) { m["seed"] = value } } -// CudnnRNNV3Seed2 sets the optional seed2 attribute to value. +// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value. // If not specified, defaults to 0 -func CudnnRNNV3Seed2(value int64) CudnnRNNV3Attr { +func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr { return func(m optionalAttr) { m["seed2"] = value } } -// CudnnRNNV3IsTraining sets the optional is_training attribute to value. -// If not specified, defaults to true -func CudnnRNNV3IsTraining(value bool) CudnnRNNV3Attr { - return func(m optionalAttr) { - m["is_training"] = value - } -} - -// CudnnRNNV3TimeMajor sets the optional time_major attribute to value. -// If not specified, defaults to true -func CudnnRNNV3TimeMajor(value bool) CudnnRNNV3Attr { - return func(m optionalAttr) { - m["time_major"] = value - } -} - -// A RNN backed by cuDNN. +// Backprop step of CudnnRNN. // -// Computes the RNN from the input and initial states, with respect to the params -// buffer. Accepts one extra input "sequence_lengths" than CudnnRNN. +// Compute the backprop of both data and weights in a RNN. // // rnn_mode: Indicates the type of the RNN model. -// input_mode: Indicates whether there is a linear projection between the input and -// the actual computation before the first layer. 'skip_input' is only allowed -// when input_size == num_units; 'auto_select' implies 'skip_input' when -// input_size == num_units; otherwise, it implies 'linear_input'. +// input_mode: Indicate whether there is a linear projection between the input and +// the actual computation before the first layer. 'skip_input' is only allowed +// when input_size == num_units; 'auto_select' implies 'skip_input' when +// input_size == num_units; otherwise, it implies 'linear_input'. // direction: Indicates whether a bidirectional model will be used. Should be // "unidirectional" or "bidirectional". // dropout: Dropout probability. When set to 0., dropout is disabled. // seed: The 1st part of a seed to initialize dropout. // seed2: The 2nd part of a seed to initialize dropout. -// input: If time_major is true, this is a 3-D tensor with the shape of -// [seq_length, batch_size, input_size]. If time_major is false, the shape is -// [batch_size, seq_length, input_size]. -// input_h: If time_major is true, this is a 3-D tensor with the shape of -// [num_layer * dir, batch_size, num_units]. If time_major is false, the shape -// is [batch_size, num_layer * dir, num_units]. +// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size]. +// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size, +// num_units]. // input_c: For LSTM, a 3-D tensor with the shape of // [num_layer * dir, batch, num_units]. For other models, it is ignored. // params: A 1-D tensor that contains the weights and biases in an opaque layout. // The size must be created through CudnnRNNParamsSize, and initialized // separately. Note that they might not be compatible across different // generations. So it is a good idea to save and restore -// sequence_lengths: a vector of lengths of each input sequence. -// output: If time_major is true, this is a 3-D tensor with the shape of -// [seq_length, batch_size, dir * num_units]. 
If time_major is false, the -// shape is [batch_size, seq_length, dir * num_units]. +// output: A 3-D tensor with the shape of [seq_length, batch_size, +// dir * num_units]. // output_h: The same shape has input_h. // output_c: The same shape as input_c for LSTM. An empty tensor for other models. -// is_training: Indicates whether this operation is used for inferenece or -// training. -// time_major: Indicates whether the input/output format is time major or batch -// major. -// reserve_space: An opaque tensor that can be used in backprop calculation. It -// is only produced if is_training is true. -func CudnnRNNV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, optional ...CudnnRNNV3Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) { +// output_backprop: A 3-D tensor with the same shape as output in the forward pass. +// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward +// pass. +// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward +// pass. +// reserve_space: The same reserve_space produced in for forward operation. +// input_backprop: The backprop to input in the forward pass. Has the same shape +// as input. +// input_h_backprop: The backprop to input_h in the forward pass. Has the same +// shape as input_h. +// input_c_backprop: The backprop to input_c in the forward pass. Has the same +// shape as input_c. +// params_backprop: The backprop to the params buffer in the forward pass. Has the +// same shape as params. +func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) { if scope.Err() != nil { return } @@ -5403,14 +5448,14 @@ func CudnnRNNV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Out a(attrs) } opspec := tf.OpSpec{ - Type: "CudnnRNNV3", + Type: "CudnnRNNBackprop", Input: []tf.Input{ - input, input_h, input_c, params, sequence_lengths, + input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, }, Attrs: attrs, } op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3) } // CudnnRNNV2Attr is an optional argument to CudnnRNNV2. @@ -5817,50 +5862,49 @@ func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) return op.Output(0) } -// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey. -type OrderedMapUnstageNoKeyAttr func(optionalAttr) +// OrderedMapClearAttr is an optional argument to OrderedMapClear. +type OrderedMapClearAttr func(optionalAttr) -// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value. +// OrderedMapClearCapacity sets the optional capacity attribute to value. 
// If not specified, defaults to 0 // // REQUIRES: value >= 0 -func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr { +func OrderedMapClearCapacity(value int64) OrderedMapClearAttr { return func(m optionalAttr) { m["capacity"] = value } } -// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value. +// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value. // If not specified, defaults to 0 // // REQUIRES: value >= 0 -func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr { +func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr { return func(m optionalAttr) { m["memory_limit"] = value } } -// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value. +// OrderedMapClearContainer sets the optional container attribute to value. // If not specified, defaults to "" -func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr { +func OrderedMapClearContainer(value string) OrderedMapClearAttr { return func(m optionalAttr) { m["container"] = value } } -// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value. +// OrderedMapClearSharedName sets the optional shared_name attribute to value. // If not specified, defaults to "" -func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr { +func OrderedMapClearSharedName(value string) OrderedMapClearAttr { return func(m optionalAttr) { m["shared_name"] = value } } -// Op removes and returns the (key, value) element with the smallest +// Op removes all elements in the underlying container. // -// key from the underlying container. If the underlying container -// does not contain elements, the op will block until it does. -func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) { +// Returns the created operation. +func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) { if scope.Err() != nil { return } @@ -5869,70 +5913,54 @@ func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataTyp a(attrs) } opspec := tf.OpSpec{ - Type: "OrderedMapUnstageNoKey", - Input: []tf.Input{ - indices, - }, + Type: "OrderedMapClear", + Attrs: attrs, } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - key = op.Output(idx) - if values, idx, err = makeOutputList(op, idx, "values"); err != nil { - scope.UpdateErr("OrderedMapUnstageNoKey", err) - return - } - return key, values + return scope.AddOperation(opspec) } -// OrderedMapUnstageAttr is an optional argument to OrderedMapUnstage. -type OrderedMapUnstageAttr func(optionalAttr) +// OrderedMapSizeAttr is an optional argument to OrderedMapSize. +type OrderedMapSizeAttr func(optionalAttr) -// OrderedMapUnstageCapacity sets the optional capacity attribute to value. +// OrderedMapSizeCapacity sets the optional capacity attribute to value. // If not specified, defaults to 0 // // REQUIRES: value >= 0 -func OrderedMapUnstageCapacity(value int64) OrderedMapUnstageAttr { +func OrderedMapSizeCapacity(value int64) OrderedMapSizeAttr { return func(m optionalAttr) { m["capacity"] = value } } -// OrderedMapUnstageMemoryLimit sets the optional memory_limit attribute to value. +// OrderedMapSizeMemoryLimit sets the optional memory_limit attribute to value. 
// If not specified, defaults to 0 // // REQUIRES: value >= 0 -func OrderedMapUnstageMemoryLimit(value int64) OrderedMapUnstageAttr { +func OrderedMapSizeMemoryLimit(value int64) OrderedMapSizeAttr { return func(m optionalAttr) { m["memory_limit"] = value } } -// OrderedMapUnstageContainer sets the optional container attribute to value. +// OrderedMapSizeContainer sets the optional container attribute to value. // If not specified, defaults to "" -func OrderedMapUnstageContainer(value string) OrderedMapUnstageAttr { +func OrderedMapSizeContainer(value string) OrderedMapSizeAttr { return func(m optionalAttr) { m["container"] = value } } -// OrderedMapUnstageSharedName sets the optional shared_name attribute to value. +// OrderedMapSizeSharedName sets the optional shared_name attribute to value. // If not specified, defaults to "" -func OrderedMapUnstageSharedName(value string) OrderedMapUnstageAttr { +func OrderedMapSizeSharedName(value string) OrderedMapSizeAttr { return func(m optionalAttr) { m["shared_name"] = value } } -// Op removes and returns the values associated with the key -// -// from the underlying container. If the underlying container -// does not contain this key, the op will block until it does. -func OrderedMapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageAttr) (values []tf.Output) { +// Op returns the number of elements in the underlying container. +func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSizeAttr) (size tf.Output) { if scope.Err() != nil { return } @@ -5941,95 +5969,12 @@ func OrderedMapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes [] a(attrs) } opspec := tf.OpSpec{ - Type: "OrderedMapUnstage", - Input: []tf.Input{ - key, indices, - }, + Type: "OrderedMapSize", + Attrs: attrs, } op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if values, idx, err = makeOutputList(op, idx, "values"); err != nil { - scope.UpdateErr("OrderedMapUnstage", err) - return - } - return values -} - -// OrderedMapPeekAttr is an optional argument to OrderedMapPeek. -type OrderedMapPeekAttr func(optionalAttr) - -// OrderedMapPeekCapacity sets the optional capacity attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr { - return func(m optionalAttr) { - m["memory_limit"] = value - } -} - -// OrderedMapPeekContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func OrderedMapPeekContainer(value string) OrderedMapPeekAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// OrderedMapPeekSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Op peeks at the values at the specified key. If the -// -// underlying container does not contain this key -// this op will block until it does. This Op is optimized for -// performance. 
-func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtypes": dtypes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "OrderedMapPeek", - Input: []tf.Input{ - key, indices, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if values, idx, err = makeOutputList(op, idx, "values"); err != nil { - scope.UpdateErr("OrderedMapPeek", err) - return - } - return values + return op.Output(0) } // OrderedMapStageAttr is an optional argument to OrderedMapStage. @@ -6109,47 +6054,47 @@ func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf return scope.AddOperation(opspec) } -// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize. -type MapIncompleteSizeAttr func(optionalAttr) +// MapSizeAttr is an optional argument to MapSize. +type MapSizeAttr func(optionalAttr) -// MapIncompleteSizeCapacity sets the optional capacity attribute to value. +// MapSizeCapacity sets the optional capacity attribute to value. // If not specified, defaults to 0 // // REQUIRES: value >= 0 -func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr { +func MapSizeCapacity(value int64) MapSizeAttr { return func(m optionalAttr) { m["capacity"] = value } } -// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value. +// MapSizeMemoryLimit sets the optional memory_limit attribute to value. // If not specified, defaults to 0 // // REQUIRES: value >= 0 -func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr { +func MapSizeMemoryLimit(value int64) MapSizeAttr { return func(m optionalAttr) { m["memory_limit"] = value } } -// MapIncompleteSizeContainer sets the optional container attribute to value. +// MapSizeContainer sets the optional container attribute to value. // If not specified, defaults to "" -func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr { +func MapSizeContainer(value string) MapSizeAttr { return func(m optionalAttr) { m["container"] = value } } -// MapIncompleteSizeSharedName sets the optional shared_name attribute to value. +// MapSizeSharedName sets the optional shared_name attribute to value. // If not specified, defaults to "" -func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr { +func MapSizeSharedName(value string) MapSizeAttr { return func(m optionalAttr) { m["shared_name"] = value } } -// Op returns the number of incomplete elements in the underlying container. -func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) { +// Op returns the number of elements in the underlying container. +func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) { if scope.Err() != nil { return } @@ -6158,7 +6103,7 @@ func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncomp a(attrs) } opspec := tf.OpSpec{ - Type: "MapIncompleteSize", + Type: "MapSize", Attrs: attrs, } @@ -6238,6 +6183,148 @@ func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, opti return key, values } +// MapUnstageAttr is an optional argument to MapUnstage. +type MapUnstageAttr func(optionalAttr) + +// MapUnstageCapacity sets the optional capacity attribute to value. 
+// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func MapUnstageCapacity(value int64) MapUnstageAttr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// MapUnstageMemoryLimit sets the optional memory_limit attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func MapUnstageMemoryLimit(value int64) MapUnstageAttr { + return func(m optionalAttr) { + m["memory_limit"] = value + } +} + +// MapUnstageContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func MapUnstageContainer(value string) MapUnstageAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// MapUnstageSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func MapUnstageSharedName(value string) MapUnstageAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Op removes and returns the values associated with the key +// +// from the underlying container. If the underlying container +// does not contain this key, the op will block until it does. +func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtypes": dtypes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MapUnstage", + Input: []tf.Input{ + key, indices, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if values, idx, err = makeOutputList(op, idx, "values"); err != nil { + scope.UpdateErr("MapUnstage", err) + return + } + return values +} + +// MapPeekAttr is an optional argument to MapPeek. +type MapPeekAttr func(optionalAttr) + +// MapPeekCapacity sets the optional capacity attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func MapPeekCapacity(value int64) MapPeekAttr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// MapPeekMemoryLimit sets the optional memory_limit attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func MapPeekMemoryLimit(value int64) MapPeekAttr { + return func(m optionalAttr) { + m["memory_limit"] = value + } +} + +// MapPeekContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func MapPeekContainer(value string) MapPeekAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// MapPeekSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func MapPeekSharedName(value string) MapPeekAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Op peeks at the values at the specified key. If the +// +// underlying container does not contain this key +// this op will block until it does. 
+func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtypes": dtypes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MapPeek", + Input: []tf.Input{ + key, indices, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if values, idx, err = makeOutputList(op, idx, "values"); err != nil { + scope.UpdateErr("MapPeek", err) + return + } + return values +} + // MapStageAttr is an optional argument to MapStage. type MapStageAttr func(optionalAttr) @@ -6313,6 +6400,87 @@ func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output return scope.AddOperation(opspec) } +// StageClearAttr is an optional argument to StageClear. +type StageClearAttr func(optionalAttr) + +// StageClearCapacity sets the optional capacity attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func StageClearCapacity(value int64) StageClearAttr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// StageClearMemoryLimit sets the optional memory_limit attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func StageClearMemoryLimit(value int64) StageClearAttr { + return func(m optionalAttr) { + m["memory_limit"] = value + } +} + +// StageClearContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func StageClearContainer(value string) StageClearAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// StageClearSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func StageClearSharedName(value string) StageClearAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Op removes all elements in the underlying container. +// +// Returns the created operation. +func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtypes": dtypes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StageClear", + + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Deserializes a serialized tree ensemble config and replaces current tree +// +// ensemble. +// +// Arguments: +// tree_ensemble_handle: Handle to the tree ensemble. +// stamp_token: Token to use as the new value of the resource stamp. +// tree_ensemble_serialized: Serialized proto of the ensemble. +// +// Returns the created operation. +func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BoostedTreesDeserializeEnsemble", + Input: []tf.Input{ + tree_ensemble_handle, stamp_token, tree_ensemble_serialized, + }, + } + return scope.AddOperation(opspec) +} + // StageSizeAttr is an optional argument to StageSize. type StageSizeAttr func(optionalAttr) @@ -6370,50 +6538,50 @@ func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (s return op.Output(0) } -// UnstageAttr is an optional argument to Unstage. -type UnstageAttr func(optionalAttr) +// OrderedMapUnstageAttr is an optional argument to OrderedMapUnstage. 
+type OrderedMapUnstageAttr func(optionalAttr) -// UnstageCapacity sets the optional capacity attribute to value. +// OrderedMapUnstageCapacity sets the optional capacity attribute to value. // If not specified, defaults to 0 // // REQUIRES: value >= 0 -func UnstageCapacity(value int64) UnstageAttr { +func OrderedMapUnstageCapacity(value int64) OrderedMapUnstageAttr { return func(m optionalAttr) { m["capacity"] = value } } -// UnstageMemoryLimit sets the optional memory_limit attribute to value. +// OrderedMapUnstageMemoryLimit sets the optional memory_limit attribute to value. // If not specified, defaults to 0 // // REQUIRES: value >= 0 -func UnstageMemoryLimit(value int64) UnstageAttr { +func OrderedMapUnstageMemoryLimit(value int64) OrderedMapUnstageAttr { return func(m optionalAttr) { m["memory_limit"] = value } } -// UnstageContainer sets the optional container attribute to value. +// OrderedMapUnstageContainer sets the optional container attribute to value. // If not specified, defaults to "" -func UnstageContainer(value string) UnstageAttr { +func OrderedMapUnstageContainer(value string) OrderedMapUnstageAttr { return func(m optionalAttr) { m["container"] = value } } -// UnstageSharedName sets the optional shared_name attribute to value. +// OrderedMapUnstageSharedName sets the optional shared_name attribute to value. // If not specified, defaults to "" -func UnstageSharedName(value string) UnstageAttr { +func OrderedMapUnstageSharedName(value string) OrderedMapUnstageAttr { return func(m optionalAttr) { m["shared_name"] = value } } -// Op is similar to a lightweight Dequeue. +// Op removes and returns the values associated with the key // -// The basic functionality is similar to dequeue with many fewer -// capabilities and options. This Op is optimized for performance. -func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) { +// from the underlying container. If the underlying container +// does not contain this key, the op will block until it does. +func OrderedMapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageAttr) (values []tf.Output) { if scope.Err() != nil { return } @@ -6422,8 +6590,10 @@ func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (value a(attrs) } opspec := tf.OpSpec{ - Type: "Unstage", - + Type: "OrderedMapUnstage", + Input: []tf.Input{ + key, indices, + }, Attrs: attrs, } op := scope.AddOperation(opspec) @@ -6433,7 +6603,79 @@ func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (value var idx int var err error if values, idx, err = makeOutputList(op, idx, "values"); err != nil { - scope.UpdateErr("Unstage", err) + scope.UpdateErr("OrderedMapUnstage", err) + return + } + return values +} + +// StagePeekAttr is an optional argument to StagePeek. +type StagePeekAttr func(optionalAttr) + +// StagePeekCapacity sets the optional capacity attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func StagePeekCapacity(value int64) StagePeekAttr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// StagePeekMemoryLimit sets the optional memory_limit attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func StagePeekMemoryLimit(value int64) StagePeekAttr { + return func(m optionalAttr) { + m["memory_limit"] = value + } +} + +// StagePeekContainer sets the optional container attribute to value. 
+// If not specified, defaults to "" +func StagePeekContainer(value string) StagePeekAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// StagePeekSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func StagePeekSharedName(value string) StagePeekAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Op peeks at the values at the specified index. If the +// +// underlying container does not contain sufficient elements +// this op will block until it does. This Op is optimized for +// performance. +func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtypes": dtypes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StagePeek", + Input: []tf.Input{ + index, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if values, idx, err = makeOutputList(op, idx, "values"); err != nil { + scope.UpdateErr("StagePeek", err) return } return values @@ -6517,131 +6759,6 @@ func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Opera return scope.AddOperation(opspec) } -// Delete the tensor specified by its handle in the session. -// -// Arguments: -// handle: The handle for a tensor stored in the session state. -// -// Returns the created operation. -func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "DeleteSessionTensor", - Input: []tf.Input{ - handle, - }, - } - return scope.AddOperation(opspec) -} - -// Updates specified rows with values in `v`. -// -// Computes `x[i, :] = v; return x`. -// -// Arguments: -// x: A tensor of type `T`. -// i: A vector. Indices into the left-most dimension of `x`. -// v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size. -// -// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`. -func InplaceUpdate(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "InplaceUpdate", - Input: []tf.Input{ - x, i, v, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits. -type ComputeAccidentalHitsAttr func(optionalAttr) - -// ComputeAccidentalHitsSeed sets the optional seed attribute to value. -// -// value: If either seed or seed2 are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value. -// -// value: An second seed to avoid seed collision. -// If not specified, defaults to 0 -func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Computes the ids of the positions in sampled_candidates that match true_labels. 
-// -// When doing log-odds NCE, the result of this op should be passed through a -// SparseToDense op, then added to the logits of the sampled candidates. This has -// the effect of 'removing' the sampled labels that match the true labels by -// making the classifier sure that they are sampled labels. -// -// Arguments: -// true_classes: The true_classes output of UnpackSparseLabels. -// sampled_candidates: The sampled_candidates output of CandidateSampler. -// num_true: Number of true labels per context. -// -// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label -// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element -// is -FLOAT_MAX. -func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_true": num_true} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ComputeAccidentalHits", - Input: []tf.Input{ - true_classes, sampled_candidates, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Get the value of the tensor specified by its handle. -// -// Arguments: -// handle: The handle for a tensor stored in the session state. -// dtype: The type of the output value. -// -// Returns The tensor for the given handle. -func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - opspec := tf.OpSpec{ - Type: "GetSessionTensor", - Input: []tf.Input{ - handle, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Store the input tensor in the state of the current session. // // Arguments: @@ -6663,6 +6780,61 @@ func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) { return op.Output(0) } +// Pads a tensor with mirrored values. +// +// This operation pads a `input` with mirrored values according to the `paddings` +// you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is +// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates +// how many values to add before the contents of `input` in that dimension, and +// `paddings[D, 1]` indicates how many values to add after the contents of `input` +// in that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater +// than `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true +// (if false, respectively). +// +// The padded size of each dimension D of the output is: +// +// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` +// +// For example: +// +// ``` +// # 't' is [[1, 2, 3], [4, 5, 6]]. +// # 'paddings' is [[1, 1]], [2, 2]]. +// # 'mode' is SYMMETRIC. +// # rank of 't' is 2. +// pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2] +// [2, 1, 1, 2, 3, 3, 2] +// [5, 4, 4, 5, 6, 6, 5] +// [5, 4, 4, 5, 6, 6, 5]] +// ``` +// +// Arguments: +// input: The input tensor to be padded. +// paddings: A two-column matrix specifying the padding sizes. The number of +// rows must be the same as the rank of `input`. +// mode: Either `REFLECT` or `SYMMETRIC`. 
In reflect mode the padded regions +// do not include the borders, while in symmetric mode the padded regions +// do include the borders. For example, if `input` is `[1, 2, 3]` and `paddings` +// is `[0, 2]`, then the output is `[1, 2, 3, 2, 1]` in reflect mode, and +// it is `[1, 2, 3, 3, 2]` in symmetric mode. +// +// Returns The padded tensor. +func MirrorPad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"mode": mode} + opspec := tf.OpSpec{ + Type: "MirrorPad", + Input: []tf.Input{ + input, paddings, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Deprecated. Use TensorArrayCloseV3 // // DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3 @@ -6681,37 +6853,6 @@ func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) { return scope.AddOperation(opspec) } -// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2. -type TensorArrayConcatV2Attr func(optionalAttr) - -// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value. -// If not specified, defaults to <unknown_rank:true > -func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr { - return func(m optionalAttr) { - m["element_shape_except0"] = value - } -} - -// Deprecated. Use TensorArrayConcatV3 -func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TensorArrayConcatV2", - Input: []tf.Input{ - handle, flow_in, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - // Deprecated. Use TensorArrayScatterV3 // // DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3 @@ -6729,57 +6870,52 @@ func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, val return op.Output(0) } -// MapSizeAttr is an optional argument to MapSize. -type MapSizeAttr func(optionalAttr) +// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2. +type TensorArrayGatherV2Attr func(optionalAttr) -// MapSizeCapacity sets the optional capacity attribute to value. -// If not specified, defaults to 0 +// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value. +// If not specified, defaults to <unknown_rank:true > +func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr { + return func(m optionalAttr) { + m["element_shape"] = value + } +} + +// Deprecated. Use TensorArrayGatherV3 // -// REQUIRES: value >= 0 -func MapSizeCapacity(value int64) MapSizeAttr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// MapSizeMemoryLimit sets the optional memory_limit attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func MapSizeMemoryLimit(value int64) MapSizeAttr { - return func(m optionalAttr) { - m["memory_limit"] = value - } -} - -// MapSizeContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func MapSizeContainer(value string) MapSizeAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// MapSizeSharedName sets the optional shared_name attribute to value. 
-// If not specified, defaults to "" -func MapSizeSharedName(value string) MapSizeAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Op returns the number of elements in the underlying container. -func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) { +// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3 +func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"dtypes": dtypes} + attrs := map[string]interface{}{"dtype": dtype} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "MapSize", + Type: "TensorArrayGatherV2", + Input: []tf.Input{ + handle, indices, flow_in, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} +// Deprecated. Use TensorArrayReadV3 +// +// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3 +func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype} + opspec := tf.OpSpec{ + Type: "TensorArrayReadV2", + Input: []tf.Input{ + handle, index, flow_in, + }, Attrs: attrs, } op := scope.AddOperation(opspec) @@ -6805,6 +6941,63 @@ func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source return op.Output(0) } +// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize. +type MapIncompleteSizeAttr func(optionalAttr) + +// MapIncompleteSizeCapacity sets the optional capacity attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr { + return func(m optionalAttr) { + m["memory_limit"] = value + } +} + +// MapIncompleteSizeContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// MapIncompleteSizeSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Op returns the number of incomplete elements in the underlying container. +func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtypes": dtypes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MapIncompleteSize", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Delete the TensorArray from its resource container. // // This enables the user to close and release the resource in the middle @@ -6948,45 +7141,24 @@ func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtyp return op.Output(0), op.Output(1) } -// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3. 
-type TensorArrayGatherV3Attr func(optionalAttr) - -// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value. -// -// value: The expected shape of an element, if known. Used to -// validate the shapes of TensorArray elements. If this shape is not -// fully specified, gathering zero-size TensorArrays is an error. -// If not specified, defaults to <unknown_rank:true > -func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr { - return func(m optionalAttr) { - m["element_shape"] = value - } -} - -// Gather specific elements from the TensorArray into output `value`. -// -// All elements selected by `indices` must have the same shape. +// Read an element from the TensorArray into output `value`. // // Arguments: // handle: The handle to a TensorArray. -// indices: The locations in the TensorArray from which to read tensor elements. +// // flow_in: A float scalar that enforces proper chaining of operations. // dtype: The type of the elem that is returned. // -// Returns All of the elements in the TensorArray, concatenated along a new -// axis (the new dimension 0). -func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) { +// Returns The tensor that is read from the TensorArray. +func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) { if scope.Err() != nil { return } attrs := map[string]interface{}{"dtype": dtype} - for _, a := range optional { - a(attrs) - } opspec := tf.OpSpec{ - Type: "TensorArrayGatherV3", + Type: "TensorArrayReadV3", Input: []tf.Input{ - handle, indices, flow_in, + handle, index, flow_in, }, Attrs: attrs, } @@ -7017,79 +7189,6 @@ func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value t return op.Output(0) } -// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3. -type QuantizeAndDequantizeV3Attr func(optionalAttr) - -// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value. -// If not specified, defaults to true -func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr { - return func(m optionalAttr) { - m["signed_input"] = value - } -} - -// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value. -// If not specified, defaults to true -func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr { - return func(m optionalAttr) { - m["range_given"] = value - } -} - -// Quantizes then dequantizes a tensor. -// -// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a -// tensor, so its value can change during training. -func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QuantizeAndDequantizeV3", - Input: []tf.Input{ - input, input_min, input_max, num_bits, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a TensorArray for storing multiple gradients of values in the given handle. -// -// Similar to TensorArrayGradV3. 
However it creates an accumulator with an -// expanded shape compared to the input TensorArray whose gradient is being -// computed. This enables multiple gradients for the same TensorArray to be -// calculated using the same accumulator. -// -// Arguments: -// handle: The handle to the forward TensorArray. -// flow_in: A float scalar that enforces proper chaining of operations. -// shape_to_prepend: An int32 vector representing a shape. Elements in the gradient accumulator will -// have shape which is this shape_to_prepend value concatenated with shape of the -// elements in the TensorArray corresponding to the input handle. -// source: The gradient source string, used to decide which gradient TensorArray -// to return. -func TensorArrayGradWithShape(scope *Scope, handle tf.Output, flow_in tf.Output, shape_to_prepend tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"source": source} - opspec := tf.OpSpec{ - Type: "TensorArrayGradWithShape", - Input: []tf.Input{ - handle, flow_in, shape_to_prepend, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - // Creates a TensorArray for storing the gradients of values in the given handle. // // If the given TensorArray gradient already exists, returns a reference to it. @@ -7243,71 +7342,167 @@ func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ... return op.Output(0), op.Output(1) } -// StackV2Attr is an optional argument to StackV2. -type StackV2Attr func(optionalAttr) +// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler. +type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr) -// StackV2StackName sets the optional stack_name attribute to value. +// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value. // -// value: Overrides the name used for the temporary stack resource. Default -// value is the name of the 'Stack' op (which is guaranteed unique). -// If not specified, defaults to "" -func StackV2StackName(value string) StackV2Attr { +// value: If either seed or seed2 are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr { return func(m optionalAttr) { - m["stack_name"] = value + m["seed"] = value } } -// A stack that produces elements in first-in last-out order. +// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value. +// +// value: An second seed to avoid seed collision. +// If not specified, defaults to 0 +func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Generates labels for candidate sampling with a learned unigram distribution. +// +// See explanations of candidate sampling and the data formats at +// go/candidate-sampling. +// +// For each batch, this op picks a single set of sampled candidate labels. +// +// The advantages of sampling candidates per-batch are simplicity and the +// possibility of efficient dense matrix multiplication. The disadvantage is that +// the sampled candidates must be chosen independently of the context and of the +// true labels. 
// // Arguments: -// max_size: The maximum size of the stack if non-negative. If negative, the stack -// size is unlimited. -// elem_type: The type of the elements on the stack. +// true_classes: A batch_size * num_true matrix, in which each row contains the +// IDs of the num_true target_classes in the corresponding original label. +// num_true: Number of true labels per context. +// num_sampled: Number of candidates to randomly sample. +// unique: If unique is true, we sample with rejection, so that all sampled +// candidates in a batch are unique. This requires some approximation to +// estimate the post-rejection sampling probabilities. +// range_max: The sampler will sample integers from the interval [0, range_max). // -// Returns The handle to the stack. -func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) { +// Returns A vector of length num_sampled, in which each element is +// the ID of a sampled candidate.A batch_size * num_true matrix, representing +// the number of times each candidate is expected to occur in a batch +// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled +// candidate representing the number of times the candidate is expected +// to occur in a batch of sampled candidates. If unique=true, then this is a +// probability. +func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"elem_type": elem_type} + attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "StackV2", + Type: "ThreadUnsafeUnigramCandidateSampler", Input: []tf.Input{ - max_size, + true_classes, }, Attrs: attrs, } op := scope.AddOperation(opspec) - return op.Output(0) + return op.Output(0), op.Output(1), op.Output(2) } -// Add the quantile summaries to each quantile stream resource. -// -// An op that adds a list of quantile summaries to a quantile stream resource. Each -// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank) -// for a single feature. +// Delete the stack from its resource container. // // Arguments: -// quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource. -// summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature. +// handle: The handle to a stack. // // Returns the created operation. -func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) { +func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "BoostedTreesQuantileStreamResourceAddSummaries", + Type: "StackCloseV2", Input: []tf.Input{ - quantile_stream_resource_handle, tf.OutputList(summaries), + handle, }, } return scope.AddOperation(opspec) } +// UnstageAttr is an optional argument to Unstage. +type UnstageAttr func(optionalAttr) + +// UnstageCapacity sets the optional capacity attribute to value. 
+// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func UnstageCapacity(value int64) UnstageAttr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// UnstageMemoryLimit sets the optional memory_limit attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func UnstageMemoryLimit(value int64) UnstageAttr { + return func(m optionalAttr) { + m["memory_limit"] = value + } +} + +// UnstageContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func UnstageContainer(value string) UnstageAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// UnstageSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func UnstageSharedName(value string) UnstageAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Op is similar to a lightweight Dequeue. +// +// The basic functionality is similar to dequeue with many fewer +// capabilities and options. This Op is optimized for performance. +func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtypes": dtypes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Unstage", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if values, idx, err = makeOutputList(op, idx, "values"); err != nil { + scope.UpdateErr("Unstage", err) + return + } + return values +} + // QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2. type QueueDequeueUpToV2Attr func(optionalAttr) @@ -7377,44 +7572,36 @@ func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_t return components } -// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2. -type QueueDequeueManyV2Attr func(optionalAttr) +// QueueDequeueV2Attr is an optional argument to QueueDequeueV2. +type QueueDequeueV2Attr func(optionalAttr) -// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value. +// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value. // -// value: If the queue has fewer than n elements, this operation -// will block for up to timeout_ms milliseconds. +// value: If the queue is empty, this operation will block for up to +// timeout_ms milliseconds. // Note: This option is not supported yet. // If not specified, defaults to -1 -func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr { +func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr { return func(m optionalAttr) { m["timeout_ms"] = value } } -// Dequeues `n` tuples of one or more tensors from the given queue. +// Dequeues a tuple of one or more tensors from the given queue. // -// If the queue is closed and there are fewer than `n` elements, then an -// OutOfRange error is returned. -// -// This operation concatenates queue-element component tensors along the -// 0th dimension to make a single component tensor. All of the components -// in the dequeued tuple will have size `n` in the 0th dimension. -// -// This operation has `k` outputs, where `k` is the number of components in -// the tuples stored in the given queue, and output `i` is the ith +// This operation has k outputs, where k is the number of components +// in the tuples stored in the given queue, and output i is the ith // component of the dequeued tuple. // -// N.B. 
If the queue is empty, this operation will block until `n` elements -// have been dequeued (or 'timeout_ms' elapses, if specified). +// N.B. If the queue is empty, this operation will block until an element +// has been dequeued (or 'timeout_ms' elapses, if specified). // // Arguments: // handle: The handle to a queue. -// n: The number of tuples to dequeue. // component_types: The type of each component in a tuple. // // Returns One or more tensors that were dequeued as a tuple. -func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) { +func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) { if scope.Err() != nil { return } @@ -7423,9 +7610,9 @@ func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_t a(attrs) } opspec := tf.OpSpec{ - Type: "QueueDequeueManyV2", + Type: "QueueDequeueV2", Input: []tf.Input{ - handle, n, + handle, }, Attrs: attrs, } @@ -7436,12 +7623,74 @@ func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_t var idx int var err error if components, idx, err = makeOutputList(op, idx, "components"); err != nil { - scope.UpdateErr("QueueDequeueManyV2", err) + scope.UpdateErr("QueueDequeueV2", err) return } return components } +// SqueezeAttr is an optional argument to Squeeze. +type SqueezeAttr func(optionalAttr) + +// SqueezeAxis sets the optional axis attribute to value. +// +// value: If specified, only squeezes the dimensions listed. The dimension +// index starts at 0. It is an error to squeeze a dimension that is not 1. Must +// be in the range `[-rank(input), rank(input))`. +// If not specified, defaults to <> +// +// REQUIRES: len(value) >= 0 +func SqueezeAxis(value []int64) SqueezeAttr { + return func(m optionalAttr) { + m["squeeze_dims"] = value + } +} + +// Removes dimensions of size 1 from the shape of a tensor. +// +// Given a tensor `input`, this operation returns a tensor of the same type with +// all dimensions of size 1 removed. If you don't want to remove all size 1 +// dimensions, you can remove specific size 1 dimensions by specifying +// `axis`. +// +// For example: +// +// ``` +// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1] +// shape(squeeze(t)) ==> [2, 3] +// ``` +// +// Or, to remove specific size 1 dimensions: +// +// ``` +// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1] +// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1] +// ``` +// +// Arguments: +// input: The `input` to squeeze. +// +// Returns Contains the same data as `input`, but has one or more dimensions of +// size 1 removed. +func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Squeeze", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2. type QueueEnqueueManyV2Attr func(optionalAttr) @@ -7493,78 +7742,234 @@ func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, return scope.AddOperation(opspec) } -// Interleave the values from the `data` tensors into a single tensor. +// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2. 
+type QueueEnqueueV2Attr func(optionalAttr) + +// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value. // -// Builds a merged tensor such that +// value: If the queue is full, this operation will block for up to +// timeout_ms milliseconds. +// Note: This option is not supported yet. +// If not specified, defaults to -1 +func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr { + return func(m optionalAttr) { + m["timeout_ms"] = value + } +} + +// Enqueues a tuple of one or more tensors in the given queue. // -// ```python -// merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...] -// ``` +// The components input has k elements, which correspond to the components of +// tuples stored in the given queue. // -// For example, if each `indices[m]` is scalar or vector, we have +// N.B. If the queue is full, this operation will block until the given +// element has been enqueued (or 'timeout_ms' elapses, if specified). // -// ```python -// # Scalar indices: -// merged[indices[m], ...] = data[m][...] +// Arguments: +// handle: The handle to a queue. +// components: One or more tensors from which the enqueued tensors should be taken. // -// # Vector indices: -// merged[indices[m][i], ...] = data[m][i, ...] -// ``` -// -// Each `data[i].shape` must start with the corresponding `indices[i].shape`, -// and the rest of `data[i].shape` must be constant w.r.t. `i`. That is, we -// must have `data[i].shape = indices[i].shape + constant`. In terms of this -// `constant`, the output shape is -// -// merged.shape = [max(indices)] + constant -// -// Values may be merged in parallel, so if an index appears in both `indices[m][i]` -// and `indices[n][j]`, the result may be invalid. This differs from the normal -// DynamicStitch operator that defines the behavior in that case. -// -// For example: -// -// ```python -// indices[0] = 6 -// indices[1] = [4, 1] -// indices[2] = [[5, 2], [0, 3]] -// data[0] = [61, 62] -// data[1] = [[41, 42], [11, 12]] -// data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]] -// merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42], -// [51, 52], [61, 62]] -// ``` -// -// This method can be used to merge partitions created by `dynamic_partition` -// as illustrated on the following example: -// -// ```python -// # Apply function (increments x_i) on elements for which a certain condition -// # apply (x_i != -1 in this example). -// x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4]) -// condition_mask=tf.not_equal(x,tf.constant(-1.)) -// partitioned_data = tf.dynamic_partition( -// x, tf.cast(condition_mask, tf.int32) , 2) -// partitioned_data[1] = partitioned_data[1] + 1.0 -// condition_indices = tf.dynamic_partition( -// tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2) -// x = tf.dynamic_stitch(condition_indices, partitioned_data) -// # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain -// # unchanged. -// ``` -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt> -// </div> -func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) { +// Returns the created operation. 
+func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) { if scope.Err() != nil { return } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } opspec := tf.OpSpec{ - Type: "ParallelDynamicStitch", + Type: "QueueEnqueueV2", Input: []tf.Input{ - tf.OutputList(indices), tf.OutputList(data), + handle, tf.OutputList(components), }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// PriorityQueueV2Attr is an optional argument to PriorityQueueV2. +type PriorityQueueV2Attr func(optionalAttr) + +// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value. +// +// value: The type of each component in a value. +// If not specified, defaults to <> +// +// REQUIRES: len(value) >= 0 +func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr { + return func(m optionalAttr) { + m["component_types"] = value + } +} + +// PriorityQueueV2Capacity sets the optional capacity attribute to value. +// +// value: The upper bound on the number of elements in this queue. +// Negative numbers mean no limit. +// If not specified, defaults to -1 +func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// PriorityQueueV2Container sets the optional container attribute to value. +// +// value: If non-empty, this queue is placed in the given container. +// Otherwise, a default container is used. +// If not specified, defaults to "" +func PriorityQueueV2Container(value string) PriorityQueueV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// PriorityQueueV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this queue will be shared under the given name +// across multiple sessions. +// If not specified, defaults to "" +func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// A queue that produces elements sorted by the first component value. +// +// Note that the PriorityQueue requires the first component of any element +// to be a scalar int64, in addition to the other elements declared by +// component_types. Therefore calls to Enqueue and EnqueueMany (resp. Dequeue +// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra +// entry in their input (resp. output) lists. +// +// Arguments: +// shapes: The shape of each component in a value. The length of this attr must +// be either 0 or the same as the length of component_types. If the length of +// this attr is 0, the shapes of queue elements are not constrained, and +// only one element may be dequeued at a time. +// +// Returns The handle to the queue. +func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"shapes": shapes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "PriorityQueueV2", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2. +type RandomShuffleQueueV2Attr func(optionalAttr) + +// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value. +// +// value: The shape of each component in a value. The length of this attr must +// be either 0 or the same as the length of component_types. 
If the length of +// this attr is 0, the shapes of queue elements are not constrained, and +// only one element may be dequeued at a time. +// If not specified, defaults to <> +// +// REQUIRES: len(value) >= 0 +func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr { + return func(m optionalAttr) { + m["shapes"] = value + } +} + +// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value. +// +// value: The upper bound on the number of elements in this queue. +// Negative numbers mean no limit. +// If not specified, defaults to -1 +func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value. +// +// value: Dequeue will block unless there would be this +// many elements after the dequeue or the queue is closed. This +// ensures a minimum level of mixing of elements. +// If not specified, defaults to 0 +func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr { + return func(m optionalAttr) { + m["min_after_dequeue"] = value + } +} + +// RandomShuffleQueueV2Seed sets the optional seed attribute to value. +// +// value: If either seed or seed2 is set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, a random seed is used. +// If not specified, defaults to 0 +func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value. +// +// value: A second seed to avoid seed collision. +// If not specified, defaults to 0 +func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// RandomShuffleQueueV2Container sets the optional container attribute to value. +// +// value: If non-empty, this queue is placed in the given container. +// Otherwise, a default container is used. +// If not specified, defaults to "" +func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this queue will be shared under the given name +// across multiple sessions. +// If not specified, defaults to "" +func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// A queue that randomizes the order of elements. +// +// Arguments: +// component_types: The type of each component in a value. +// +// Returns The handle to the queue. +func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"component_types": component_types} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RandomShuffleQueueV2", + + Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) @@ -7648,23 +8053,40 @@ func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged return op.Output(0) } -// Delete the stack from its resource container. +// Computes the inverse permutation of a tensor. +// +// This operation computes the inverse of an index permutation. 
It takes a 1-D +// integer tensor `x`, which represents the indices of a zero-based array, and +// swaps each value with its index position. In other words, for an output tensor +// `y` and an input tensor `x`, this operation computes the following: +// +// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]` +// +// The values must include 0. There can be no duplicate values or negative values. +// +// For example: +// +// ``` +// # tensor `x` is [3, 4, 0, 2, 1] +// invert_permutation(x) ==> [2, 4, 3, 0, 1] +// ``` // // Arguments: -// handle: The handle to a stack. +// x: 1-D. // -// Returns the created operation. -func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) { +// Returns 1-D. +func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "StackCloseV2", + Type: "InvertPermutation", Input: []tf.Input{ - handle, + x, }, } - return scope.AddOperation(opspec) + op := scope.AddOperation(opspec) + return op.Output(0) } // Partitions `data` into `num_partitions` tensors using indices from `partitions`. @@ -7788,137 +8210,20 @@ func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output, return op.Output(0) } -// Gets next element for the provided shard number. +// Produces a string handle for the given MultiDeviceIterator. // // Arguments: // multi_device_iterator: A MultiDeviceIterator resource. -// shard_num: Integer representing which shard to fetch data for. -// incarnation_id: Which incarnation of the MultiDeviceIterator is running. -// output_types: The type list for the return values. -// output_shapes: The list of shapes being produced. // -// Returns Result of the get_next on the dataset. -func MultiDeviceIteratorGetNextFromShard(scope *Scope, multi_device_iterator tf.Output, shard_num tf.Output, incarnation_id tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "MultiDeviceIteratorGetNextFromShard", - Input: []tf.Input{ - multi_device_iterator, shard_num, incarnation_id, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if components, idx, err = makeOutputList(op, idx, "components"); err != nil { - scope.UpdateErr("MultiDeviceIteratorGetNextFromShard", err) - return - } - return components -} - -// Initializes the multi device iterator with the given dataset. -// -// Arguments: -// dataset: Dataset to be iterated upon. -// multi_device_iterator: A MultiDeviceIteratorResource. -// max_buffer_size: The maximum size of the host side per device buffer to keep. -// -// Returns An int64 indicating which incarnation of the MultiDeviceIterator -// is running. -func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) { +// Returns A string representing the resource. +func MultiDeviceIteratorToStringHandle(scope *Scope, multi_device_iterator tf.Output) (string_handle tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "MultiDeviceIteratorInit", + Type: "MultiDeviceIteratorToStringHandle", Input: []tf.Input{ - dataset, multi_device_iterator, max_buffer_size, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the inverse permutation of a tensor. 
-// -// This operation computes the inverse of an index permutation. It takes a 1-D -// integer tensor `x`, which represents the indices of a zero-based array, and -// swaps each value with its index position. In other words, for an output tensor -// `y` and an input tensor `x`, this operation computes the following: -// -// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]` -// -// The values must include 0. There can be no duplicate values or negative values. -// -// For example: -// -// ``` -// # tensor `x` is [3, 4, 0, 2, 1] -// invert_permutation(x) ==> [2, 4, 3, 0, 1] -// ``` -// -// Arguments: -// x: 1-D. -// -// Returns 1-D. -func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "InvertPermutation", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a tensor filled with a scalar value. -// -// This operation creates a tensor of shape `dims` and fills it with `value`. -// -// For example: -// -// ``` -// # Output tensor has shape [2, 3]. -// fill([2, 3], 9) ==> [[9, 9, 9] -// [9, 9, 9]] -// ``` -// -// `tf.fill` differs from `tf.constant` in a few ways: -// -// * `tf.fill` only supports scalar contents, whereas `tf.constant` supports -// Tensor values. -// * `tf.fill` creates an Op in the computation graph that constructs the actual -// Tensor value at runtime. This is in contrast to `tf.constant` which embeds -// the entire Tensor into the graph with a `Const` node. -// * Because `tf.fill` evaluates at graph runtime, it supports dynamic shapes -// based on other runtime Tensors, unlike `tf.constant`. -// -// Arguments: -// dims: 1-D. Represents the shape of the output tensor. -// value: 0-D (scalar). Value to fill the returned tensor. -// -// @compatibility(numpy) -// Equivalent to np.full -// @end_compatibility -func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Fill", - Input: []tf.Input{ - dims, value, + multi_device_iterator, }, } op := scope.AddOperation(opspec) @@ -7963,56 +8268,88 @@ func ModelDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataT return op.Output(0) } -// Makes its input available to the next iteration. -// -// Arguments: -// data: The tensor to be made available to the next iteration. -// -// Returns The same tensor as `data`. -func NextIteration(scope *Scope, data tf.Output) (output tf.Output) { +// Creates an Optional variant with no value. +func OptionalNone(scope *Scope) (optional tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "NextIteration", - Input: []tf.Input{ - data, - }, + Type: "OptionalNone", } op := scope.AddOperation(opspec) return op.Output(0) } -// Gets the next output from the given iterator as an Optional variant. -func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (optional tf.Output) { +// OrderedMapPeekAttr is an optional argument to OrderedMapPeek. +type OrderedMapPeekAttr func(optionalAttr) + +// OrderedMapPeekCapacity sets the optional capacity attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value. 
+// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr { + return func(m optionalAttr) { + m["memory_limit"] = value + } +} + +// OrderedMapPeekContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func OrderedMapPeekContainer(value string) OrderedMapPeekAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// OrderedMapPeekSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Op peeks at the values at the specified key. If the +// +// underlying container does not contain this key +// this op will block until it does. This Op is optimized for +// performance. +func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + attrs := map[string]interface{}{"dtypes": dtypes} + for _, a := range optional { + a(attrs) + } opspec := tf.OpSpec{ - Type: "IteratorGetNextAsOptional", + Type: "OrderedMapPeek", Input: []tf.Input{ - iterator, + key, indices, }, Attrs: attrs, } op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns true if and only if the given Optional variant has a value. -func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) { if scope.Err() != nil { return } - opspec := tf.OpSpec{ - Type: "OptionalHasValue", - Input: []tf.Input{ - optional, - }, + var idx int + var err error + if values, idx, err = makeOutputList(op, idx, "values"); err != nil { + scope.UpdateErr("OrderedMapPeek", err) + return } - op := scope.AddOperation(opspec) - return op.Output(0) + return values } // OptimizeDatasetAttr is an optional argument to OptimizeDataset. @@ -8054,6 +8391,138 @@ func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Out return op.Output(0) } +// Returns a serialized GraphDef representing `input_dataset`. +// +// Returns a graph representation for `input_dataset`. +// +// Arguments: +// input_dataset: A variant tensor representing the dataset to return the graph representation for. +// +// Returns The graph representation of the dataset (as serialized GraphDef). +func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "DatasetToGraph", + Input: []tf.Input{ + input_dataset, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Converts the given variant tensor to an iterator and stores it in the given resource. +// +// Arguments: +// resource_handle: A handle to an iterator resource. +// serialized: A variant tensor storing the state of the iterator contained in the +// resource. +// +// Returns the created operation. +func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "DeserializeIterator", + Input: []tf.Input{ + resource_handle, serialized, + }, + } + return scope.AddOperation(opspec) +} + +// Receives a tensor value broadcast from another device. 
+func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape} + opspec := tf.OpSpec{ + Type: "CollectiveBcastRecv", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle. +type IteratorFromStringHandleAttr func(optionalAttr) + +// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value. +// +// value: If specified, defines the type of each tuple component in an +// element produced by the resulting iterator. +// If not specified, defaults to <> +// +// REQUIRES: len(value) >= 0 +func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr { + return func(m optionalAttr) { + m["output_types"] = value + } +} + +// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value. +// +// value: If specified, defines the shape of each tuple component in an +// element produced by the resulting iterator. +// If not specified, defaults to <> +// +// REQUIRES: len(value) >= 0 +func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr { + return func(m optionalAttr) { + m["output_shapes"] = value + } +} + +// Converts the given string representing a handle to an iterator to a resource. +// +// Arguments: +// string_handle: A string representation of the given handle. +// +// Returns A handle to an iterator resource. +func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "IteratorFromStringHandle", + Input: []tf.Input{ + string_handle, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Converts the given `resource_handle` representing an iterator to a string. +// +// Arguments: +// resource_handle: A handle to an iterator resource. +// +// Returns A string representation of the given handle. +func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "IteratorToStringHandle", + Input: []tf.Input{ + resource_handle, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Outputs the single element from the given dataset. // // Arguments: @@ -8087,6 +8556,48 @@ func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.D return components } +// Computes offsets of concat inputs within its output. +// +// For example: +// +// ``` +// # 'x' is [2, 2, 7] +// # 'y' is [2, 3, 7] +// # 'z' is [2, 5, 7] +// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0] +// ``` +// +// This is typically used by gradient computations for a concat operation. +// +// Arguments: +// concat_dim: The dimension along which to concatenate. +// shape: The `N` int32 vectors representing shape of tensors being concatenated. +// +// Returns The `N` int32 vectors representing the starting offset +// of input tensors within the concatenated output. 
+func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ConcatOffset", + Input: []tf.Input{ + concat_dim, tf.OutputList(shape), + }, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil { + scope.UpdateErr("ConcatOffset", err) + return + } + return offset +} + // Gets the next output from the given iterator. // // This operation is a synchronous version IteratorGetNext. It should only be used @@ -8118,68 +8629,63 @@ func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.Dat return components } -// Returns a batched diagonal tensor with a given batched diagonal values. -// -// Given a `diagonal`, this operation returns a tensor with the `diagonal` and -// everything else padded with zeros. The diagonal is computed as follows: -// -// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a -// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where: -// -// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`. -// -// For example: -// -// ``` -// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]] -// -// and diagonal.shape = (2, 4) -// -// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0] -// [0, 2, 0, 0] -// [0, 0, 3, 0] -// [0, 0, 0, 4]], -// [[5, 0, 0, 0] -// [0, 6, 0, 0] -// [0, 0, 7, 0] -// [0, 0, 0, 8]]] -// -// which has shape (2, 4, 4) -// ``` -// -// Arguments: -// diagonal: Rank `k`, where `k >= 1`. -// -// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`. -func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) { +// Broadcasts a tensor value to one or more other devices. +func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) { if scope.Err() != nil { return } + attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape} opspec := tf.OpSpec{ - Type: "MatrixDiag", + Type: "CollectiveBcastSend", Input: []tf.Input{ - diagonal, + input, }, + Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) } -// Makes a new iterator from the given `dataset` and stores it in `iterator`. +// QueueCloseV2Attr is an optional argument to QueueCloseV2. +type QueueCloseV2Attr func(optionalAttr) + +// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value. // -// This operation may be executed multiple times. Each execution will reset the -// iterator in `iterator` to the first element of `dataset`. +// value: If true, all pending enqueue requests that are +// blocked on the given queue will be canceled. +// If not specified, defaults to false +func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr { + return func(m optionalAttr) { + m["cancel_pending_enqueues"] = value + } +} + +// Closes the given queue. +// +// This operation signals that no more elements will be enqueued in the +// given queue. Subsequent Enqueue(Many) operations will fail. +// Subsequent Dequeue(Many) operations will continue to succeed if +// sufficient elements remain in the queue. Subsequent Dequeue(Many) +// operations that would block will fail immediately. +// +// Arguments: +// handle: The handle to a queue. // // Returns the created operation. 
-func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) { +func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) { if scope.Err() != nil { return } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } opspec := tf.OpSpec{ - Type: "MakeIterator", + Type: "QueueCloseV2", Input: []tf.Input{ - dataset, iterator, + handle, }, + Attrs: attrs, } return scope.AddOperation(opspec) } @@ -8204,55 +8710,6 @@ func DeleteIterator(scope *Scope, handle tf.Output, deleter tf.Output) (o *tf.Op return scope.AddOperation(opspec) } -// Returns the rank of a tensor. -// -// This operation returns an integer representing the rank of `input`. -// -// For example: -// -// ``` -// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] -// # shape of tensor 't' is [2, 2, 3] -// rank(t) ==> 3 -// ``` -// -// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank -// of a tensor is the number of indices required to uniquely select each element -// of the tensor. Rank is also known as "order", "degree", or "ndims." -func Rank(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Rank", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// A container for an iterator resource. -// -// Returns A handle to the iterator that can be passed to a "MakeIterator" or -// "IteratorGetNext" op. In contrast to Iterator, AnonymousIterator prevents -// resource sharing by name, and does not keep a reference to the resource -// container.A variant deleter that should be passed into the op that deletes the iterator. -func AnonymousIteratorV2(scope *Scope, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output, deleter tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "AnonymousIteratorV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - // A container for an iterator resource. // // Returns A handle to the iterator that can be passed to a "MakeIterator" or @@ -8273,84 +8730,39 @@ func AnonymousIterator(scope *Scope, output_types []tf.DataType, output_shapes [ return op.Output(0) } -// Copy a tensor setting everything outside a central band in each innermost matrix -// -// to zero. -// -// The `band` part is computed as follows: -// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a -// tensor with the same shape where -// -// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`. -// -// The indicator function -// -// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) && -// (num_upper < 0 || (n-m) <= num_upper)`. -// -// For example: -// -// ``` -// # if 'input' is [[ 0, 1, 2, 3] -// [-1, 0, 1, 2] -// [-2, -1, 0, 1] -// [-3, -2, -1, 0]], -// -// tf.matrix_band_part(input, 1, -1) ==> [[ 0, 1, 2, 3] -// [-1, 0, 1, 2] -// [ 0, -1, 0, 1] -// [ 0, 0, -1, 0]], -// -// tf.matrix_band_part(input, 2, 1) ==> [[ 0, 1, 0, 0] -// [-1, 0, 1, 0] -// [-2, -1, 0, 1] -// [ 0, -2, -1, 0]] -// ``` -// -// Useful special cases: -// -// ``` -// tf.matrix_band_part(input, 0, -1) ==> Upper triangular part. -// tf.matrix_band_part(input, -1, 0) ==> Lower triangular part. -// tf.matrix_band_part(input, 0, 0) ==> Diagonal. 
-// ``` -// -// Arguments: -// input: Rank `k` tensor. -// num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire -// lower triangle. -// num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep -// entire upper triangle. -// -// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor. -func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) { +// Returns true if and only if the given Optional variant has a value. +func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "MatrixBandPart", + Type: "OptionalHasValue", Input: []tf.Input{ - input, num_lower, num_upper, + optional, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// Returns a constant tensor on the host. Only for writing C++ tests. +// Creates a dataset that emits the records from one or more TFRecord files. // // Arguments: -// value: Attr `value` is the tensor to return. -// -func HostConst(scope *Scope, value tf.Tensor, dtype tf.DataType) (output tf.Output) { +// filenames: A scalar or vector containing the name(s) of the file(s) to be +// read. +// compression_type: A scalar containing either (i) the empty string (no +// compression), (ii) "ZLIB", or (iii) "GZIP". +// buffer_size: A scalar representing the number of bytes to buffer. A value of +// 0 means no buffering will be performed. +func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"value": value, "dtype": dtype} opspec := tf.OpSpec{ - Type: "HostConst", - - Attrs: attrs, + Type: "TFRecordDataset", + Input: []tf.Input{ + filenames, compression_type, buffer_size, + }, } op := scope.AddOperation(opspec) return op.Output(0) @@ -8410,158 +8822,6 @@ func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, out return op.Output(0) } -// SpaceToDepthAttr is an optional argument to SpaceToDepth. -type SpaceToDepthAttr func(optionalAttr) - -// SpaceToDepthDataFormat sets the optional data_format attribute to value. -// If not specified, defaults to "NHWC" -func SpaceToDepthDataFormat(value string) SpaceToDepthAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// SpaceToDepth for tensors of type T. -// -// Rearranges blocks of spatial data, into depth. More specifically, -// this op outputs a copy of the input tensor where values from the `height` -// and `width` dimensions are moved to the `depth` dimension. -// The attr `block_size` indicates the input block size. -// -// * Non-overlapping blocks of size `block_size x block size` are rearranged -// into depth at each location. -// * The depth of the output tensor is `block_size * block_size * input_depth`. -// * The Y, X coordinates within each block of the input become the high order -// component of the output channel index. -// * The input tensor's height and width must be divisible by block_size. -// -// The `data_format` attr specifies the layout of the input and output tensors -// with the following options: -// "NHWC": `[ batch, height, width, channels ]` -// "NCHW": `[ batch, channels, height, width ]` -// "NCHW_VECT_C": -// `qint8 [ batch, channels / 4, height, width, 4 ]` -// -// It is useful to consider the operation as transforming a 6-D Tensor. -// e.g. 
for data_format = NHWC, -// Each element in the input tensor can be specified via 6 coordinates, -// ordered by decreasing memory layout significance as: -// n,oY,bY,oX,bX,iC (where n=batch index, oX, oY means X or Y coordinates -// within the output image, bX, bY means coordinates -// within the input block, iC means input channels). -// The output would be a transpose to the following layout: -// n,oY,oX,bY,bX,iC -// -// This operation is useful for resizing the activations between convolutions -// (but keeping all data), e.g. instead of pooling. It is also useful for training -// purely convolutional models. -// -// For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and -// block_size = 2: -// -// ``` -// x = [[[[1], [2]], -// [[3], [4]]]] -// ``` -// -// This operation will output a tensor of shape `[1, 1, 1, 4]`: -// -// ``` -// [[[[1, 2, 3, 4]]]] -// ``` -// -// Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`, -// the corresponding output will have a single element (i.e. width and height are -// both 1) and will have a depth of 4 channels (1 * block_size * block_size). -// The output element shape is `[1, 1, 4]`. -// -// For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g. -// -// ``` -// x = [[[[1, 2, 3], [4, 5, 6]], -// [[7, 8, 9], [10, 11, 12]]]] -// ``` -// -// This operation, for block_size of 2, will return the following tensor of shape -// `[1, 1, 1, 12]` -// -// ``` -// [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]] -// ``` -// -// Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2: -// -// ``` -// x = [[[[1], [2], [5], [6]], -// [[3], [4], [7], [8]], -// [[9], [10], [13], [14]], -// [[11], [12], [15], [16]]]] -// ``` -// -// the operator will return the following tensor of shape `[1 2 2 4]`: -// -// ``` -// x = [[[[1, 2, 3, 4], -// [5, 6, 7, 8]], -// [[9, 10, 11, 12], -// [13, 14, 15, 16]]]] -// ``` -// -// Arguments: -// -// block_size: The size of the spatial block. -func SpaceToDepth(scope *Scope, input tf.Output, block_size int64, optional ...SpaceToDepthAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"block_size": block_size} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SpaceToDepth", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a dataset that shuffles and repeats elements from `input_dataset` -// -// pseudorandomly. -// -// Arguments: -// -// buffer_size: The number of output elements to buffer in an iterator over -// this dataset. Compare with the `min_after_dequeue` attr when creating a -// `RandomShuffleQueue`. -// seed: A scalar seed for the random number generator. If either `seed` or -// `seed2` is set to be non-zero, the random number generator is seeded -// by the given seed. Otherwise, a random seed is used. -// seed2: A second scalar seed to avoid seed collision. -// count: A scalar representing the number of times the underlying dataset -// should be repeated. The default is `-1`, which results in infinite repetition. 
-// -// -func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "ShuffleAndRepeatDataset", - Input: []tf.Input{ - input_dataset, buffer_size, seed, seed2, count, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // FakeQuantWithMinMaxVarsPerChannelAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannel. type FakeQuantWithMinMaxVarsPerChannelAttr func(optionalAttr) @@ -8671,6 +8931,30 @@ func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output return op.Output(0) } +// Creates a dataset with a range of values. Corresponds to python's xrange. +// +// Arguments: +// start: corresponds to start in python's xrange(). +// stop: corresponds to stop in python's xrange(). +// step: corresponds to step in python's xrange(). +// +// +func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "RangeDataset", + Input: []tf.Input{ + start, stop, step, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // PaddedBatchDatasetV2Attr is an optional argument to PaddedBatchDatasetV2. type PaddedBatchDatasetV2Attr func(optionalAttr) @@ -8716,26 +9000,56 @@ func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.O return op.Output(0) } -// ShardDatasetAttr is an optional argument to ShardDataset. -type ShardDatasetAttr func(optionalAttr) - -// ShardDatasetRequireNonEmpty sets the optional require_non_empty attribute to value. -// If not specified, defaults to false -func ShardDatasetRequireNonEmpty(value bool) ShardDatasetAttr { - return func(m optionalAttr) { - m["require_non_empty"] = value - } -} - -// Creates a `Dataset` that includes only 1/`num_shards` of this dataset. +// Creates a dataset that batches and pads `batch_size` elements from the input. // // Arguments: // -// num_shards: An integer representing the number of shards operating in parallel. -// index: An integer representing the current worker index. +// batch_size: A scalar representing the number of elements to accumulate in a +// batch. +// padded_shapes: A list of int64 tensors representing the desired padded shapes +// of the corresponding output components. These shapes may be partially +// specified, using `-1` to indicate that a particular dimension should be +// padded to the maximum size of all batch elements. +// padding_values: A list of scalars containing the padding value to use for +// each of the outputs. 
+// +func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "PaddedBatchDataset", + Input: []tf.Input{ + input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// BatchDatasetV2Attr is an optional argument to BatchDatasetV2. +type BatchDatasetV2Attr func(optionalAttr) + +// BatchDatasetV2ParallelCopy sets the optional parallel_copy attribute to value. +// If not specified, defaults to false +func BatchDatasetV2ParallelCopy(value bool) BatchDatasetV2Attr { + return func(m optionalAttr) { + m["parallel_copy"] = value + } +} + +// Creates a dataset that batches `batch_size` elements from `input_dataset`. +// +// Arguments: +// +// batch_size: A scalar representing the number of elements to accumulate in a batch. +// drop_remainder: A scalar representing whether the last batch should be dropped in case its size +// is smaller than desired. // // -func ShardDataset(scope *Scope, input_dataset tf.Output, num_shards tf.Output, index tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShardDatasetAttr) (handle tf.Output) { +func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...BatchDatasetV2Attr) (handle tf.Output) { if scope.Err() != nil { return } @@ -8744,9 +9058,9 @@ func ShardDataset(scope *Scope, input_dataset tf.Output, num_shards tf.Output, i a(attrs) } opspec := tf.OpSpec{ - Type: "ShardDataset", + Type: "BatchDatasetV2", Input: []tf.Input{ - input_dataset, num_shards, index, + input_dataset, batch_size, drop_remainder, }, Attrs: attrs, } @@ -8778,123 +9092,223 @@ func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, o return op.Output(0) } -// A dataset that creates window datasets from the input dataset. -// -// Arguments: -// -// size: A scalar representing the number of elements to accumulate in a window. -// shift: A scalar representing the steps moving the sliding window forward in one -// iteration. It must be positive. -// stride: A scalar representing the stride of the input elements of the sliding window. -// It must be positive. -// drop_remainder: A scalar representing whether a window should be dropped in case its size is -// smaller than desired. -// -// -func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { +// Connects outputs of an N-way replicated computation to N outputs. +func TPUReplicatedOutput(scope *Scope, input tf.Output, num_replicas int64) (outputs []tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + attrs := map[string]interface{}{"num_replicas": num_replicas} opspec := tf.OpSpec{ - Type: "WindowDataset", + Type: "TPUReplicatedOutput", Input: []tf.Input{ - input_dataset, size, shift, stride, drop_remainder, + input, }, Attrs: attrs, } op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Quantized Batch normalization. 
-// -// This op is deprecated and will be removed in the future. Prefer -// `tf.nn.batch_normalization`. -// -// Arguments: -// t: A 4D input Tensor. -// t_min: The value represented by the lowest quantized input. -// t_max: The value represented by the highest quantized input. -// m: A 1D mean Tensor with size matching the last dimension of t. -// This is the first output from tf.nn.moments, -// or a saved moving average thereof. -// m_min: The value represented by the lowest quantized mean. -// m_max: The value represented by the highest quantized mean. -// v: A 1D variance Tensor with size matching the last dimension of t. -// This is the second output from tf.nn.moments, -// or a saved moving average thereof. -// v_min: The value represented by the lowest quantized variance. -// v_max: The value represented by the highest quantized variance. -// beta: A 1D beta Tensor with size matching the last dimension of t. -// An offset to be added to the normalized tensor. -// beta_min: The value represented by the lowest quantized offset. -// beta_max: The value represented by the highest quantized offset. -// gamma: A 1D gamma Tensor with size matching the last dimension of t. -// If "scale_after_normalization" is true, this tensor will be multiplied -// with the normalized tensor. -// gamma_min: The value represented by the lowest quantized gamma. -// gamma_max: The value represented by the highest quantized gamma. -// -// variance_epsilon: A small float number to avoid dividing by 0. -// scale_after_normalization: A bool indicating whether the resulted tensor -// needs to be multiplied with gamma. -func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization} - opspec := tf.OpSpec{ - Type: "QuantizedBatchNormWithGlobalNormalization", - Input: []tf.Input{ - t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max, - }, - Attrs: attrs, + var idx int + var err error + if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil { + scope.UpdateErr("TPUReplicatedOutput", err) + return } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) + return outputs } -// Returns a tensor of ones with the same shape and type as x. +// Adds sparse `updates` to an existing tensor according to `indices`. +// +// This operation creates a new tensor by adding sparse `updates` to the passed +// in `tensor`. +// This operation is very similar to `tf.scatter_nd_add`, except that the updates +// are added onto an existing tensor (as opposed to a variable). If the memory +// for the existing tensor cannot be re-used, a copy is made and updated. +// +// `indices` is an integer tensor containing indices into a new tensor of shape +// `shape`. 
The last dimension of `indices` can be at most the rank of `shape`:
+//
+// indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`. `updates` is a tensor with shape
+//
+// indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of tensor_scatter_add is to add individual elements to a
+// tensor by index. For example, say we want to add 4 elements in a rank-1
+// tensor with 8 elements.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+// indices = tf.constant([[4], [3], [1], [7]])
+// updates = tf.constant([9, 10, 11, 12])
+// tensor = tf.ones([8], dtype=tf.int32)
+// updated = tf.tensor_scatter_add(tensor, indices, updates)
+// with tf.Session() as sess:
+// print(sess.run(updated))
+// ```
+//
+// The resulting tensor would look like this:
+//
+// [1, 12, 1, 11, 10, 1, 1, 13]
+//
+// We can also insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+// indices = tf.constant([[0], [2]])
+// updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+// [7, 7, 7, 7], [8, 8, 8, 8]],
+// [[5, 5, 5, 5], [6, 6, 6, 6],
+// [7, 7, 7, 7], [8, 8, 8, 8]]])
+// tensor = tf.ones([4, 4, 4])
+// updated = tf.tensor_scatter_add(tensor, indices, updates)
+// with tf.Session() as sess:
+// print(sess.run(updated))
+// ```
+//
+// The resulting tensor would look like this:
+//
+// [[[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+// [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+// [[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+// [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
 //
 // Arguments:
-// x: a tensor of type T.
+// tensor: Tensor to copy/update.
+// indices: Index tensor.
+// updates: Updates to scatter into output.
 //
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns A new tensor copied from tensor and updates added according to the indices.
+func TensorScatterAdd(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
	if scope.Err() != nil {
		return
	}
	opspec := tf.OpSpec{
-		Type: "OnesLike",
+		Type: "TensorScatterAdd",
		Input: []tf.Input{
-			x,
+			tensor, indices, updates,
		},
	}
	op := scope.AddOperation(opspec)
	return op.Output(0)
}

-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)

-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
+// HashTableV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to "" +func HashTableV2Container(value string) HashTableV2Attr { return func(m optionalAttr) { - m["out_type"] = value + m["container"] = value } } -// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)` +// HashTableV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this table is shared under the given name across +// multiple sessions. +// If not specified, defaults to "" +func HashTableV2SharedName(value string) HashTableV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value. +// +// value: If true and shared_name is empty, the table is shared +// using the node name. +// If not specified, defaults to false +func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr { + return func(m optionalAttr) { + m["use_node_name_sharing"] = value + } +} + +// Creates a non-initialized hash table. +// +// This op creates a hash table, specifying the type of its keys and values. +// Before using the table you will have to initialize it. After initialization the +// table will be immutable. // // Arguments: +// key_dtype: Type of the table keys. +// value_dtype: Type of the table values. // -// min_features: The float value that the lowest quantized value represents. -// max_features: The float value that the highest quantized value represents. +// Returns Handle to a table. +func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "HashTableV2", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad. +type FractionalAvgPoolGradAttr func(optionalAttr) + +// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value. // -// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents. -func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) { +// value: When set to True, it means when pooling, the values at the boundary +// of adjacent pooling cells are used by both cells. For example: +// +// `index 0 1 2 3 4` +// +// `value 20 5 16 3 7` +// +// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice. +// The result would be [41/3, 26/3] for fractional avg pooling. +// If not specified, defaults to false +func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr { + return func(m optionalAttr) { + m["overlapping"] = value + } +} + +// Computes gradient of the FractionalAvgPool function. +// +// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for +// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of +// out_backprop to those indices that form the same pooling cell. Therefore, we +// just need to know the shape of original input tensor, instead of the whole +// tensor. 
+// +// Arguments: +// orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool` +// out_backprop: 4-D with shape `[batch, height, width, channels]`. Gradients +// w.r.t. the output of `fractional_avg_pool`. +// row_pooling_sequence: row pooling sequence, form pooling region with +// col_pooling_sequence. +// col_pooling_sequence: column pooling sequence, form pooling region with +// row_pooling sequence. +// +// Returns 4-D. Gradients w.r.t. the input of `fractional_avg_pool`. +func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -8903,82 +9317,14 @@ func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, ma a(attrs) } opspec := tf.OpSpec{ - Type: "QuantizedRelu6", + Type: "FractionalAvgPoolGrad", Input: []tf.Input{ - features, min_features, max_features, + orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence, }, Attrs: attrs, } op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Adds Tensor 'bias' to Tensor 'input' for Quantized types. -// -// Broadcasts the values of bias on dimensions 0..N-2 of 'input'. -// -// Arguments: -// -// bias: A 1D bias Tensor with size matching the last dimension of 'input'. -// min_input: The float value that the lowest quantized input value represents. -// max_input: The float value that the highest quantized input value represents. -// min_bias: The float value that the lowest quantized bias value represents. -// max_bias: The float value that the highest quantized bias value represents. -// -// -// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. -func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"out_type": out_type} - opspec := tf.OpSpec{ - Type: "QuantizedBiasAdd", - Input: []tf.Input{ - input, bias, min_input, max_input, min_bias, max_bias, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// QuantizedReluAttr is an optional argument to QuantizedRelu. -type QuantizedReluAttr func(optionalAttr) - -// QuantizedReluOutType sets the optional out_type attribute to value. -// If not specified, defaults to DT_QUINT8 -func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// Computes Quantized Rectified Linear: `max(features, 0)` -// -// Arguments: -// -// min_features: The float value that the lowest quantized value represents. -// max_features: The float value that the highest quantized value represents. -// -// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents. 
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QuantizedRelu", - Input: []tf.Input{ - features, min_features, max_features, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) + return op.Output(0) } // FractionalMaxPoolAttr is an optional argument to FractionalMaxPool. @@ -9109,40 +9455,165 @@ func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, o return op.Output(0), op.Output(1), op.Output(2) } -// TopKV2Attr is an optional argument to TopKV2. -type TopKV2Attr func(optionalAttr) - -// TopKV2Sorted sets the optional sorted attribute to value. +// Computes softmax cross entropy cost and gradients to backpropagate. // -// value: If true the resulting `k` elements will be sorted by the values in -// descending order. -// If not specified, defaults to true -func TopKV2Sorted(value bool) TopKV2Attr { +// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept +// a matrix of label probabilities, but rather a single label per row +// of features. This label is considered to have probability 1.0 for the +// given row. +// +// Inputs are the logits, not probabilities. +// +// Arguments: +// features: batch_size x num_classes matrix +// labels: batch_size vector with values in [0, num_classes). +// This is the label for the given minibatch entry. +// +// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix). +func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseSoftmaxCrossEntropyWithLogits", + Input: []tf.Input{ + features, labels, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// Computes softmax cross entropy cost and gradients to backpropagate. +// +// Inputs are the logits, not probabilities. +// +// Arguments: +// features: batch_size x num_classes matrix +// labels: batch_size x num_classes matrix +// The caller must ensure that each batch of labels represents a valid +// probability distribution. +// +// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix). +func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SoftmaxCrossEntropyWithLogits", + Input: []tf.Input{ + features, labels, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// Computes softplus gradients for a softplus operation. +// +// Arguments: +// gradients: The backpropagated gradients to the corresponding softplus operation. +// features: The features passed as input to the corresponding softplus operation. +// +// Returns The gradients: `gradients / (1 + exp(-features))`. 
+func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SoftplusGrad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Updates the table to associate keys with values.
+//
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
+//
+// Arguments:
+// table_handle: Handle to the table.
+// keys: Any shape. Keys to look up.
+// values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableInsertV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes gradients for the exponential linear (Elu) operation.
+//
+// Arguments:
+// gradients: The backpropagated gradients to the corresponding Elu operation.
+// outputs: The outputs of the corresponding Elu operation.
+//
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "EluGrad",
+		Input: []tf.Input{
+			gradients, outputs,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of x OR y element-wise.
+//
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalOr",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LeakyReluGradAttr is an optional argument to LeakyReluGrad.
+type LeakyReluGradAttr func(optionalAttr)
+
+// LeakyReluGradAlpha sets the optional alpha attribute to value.
+// If not specified, defaults to 0.2
+func LeakyReluGradAlpha(value float32) LeakyReluGradAttr {
	return func(m optionalAttr) {
-		m["sorted"] = value
+		m["alpha"] = value
	}
}

-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors. Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension). Thus,
-//
-// values.shape = indices.shape = input.shape[:-1] + [k]
-//
-// If two elements are equal, the lower-index element appears first.
+// Computes rectified linear gradients for a LeakyRelu operation.
 //
 // Arguments:
-// input: 1-D or higher with last dimension at least `k`.
-// k: 0-D. Number of top elements to look for along the last dimension (along each
-// row for matrices).
+// gradients: The backpropagated gradients to the corresponding LeakyRelu operation.
+// features: The features passed as input to the corresponding LeakyRelu operation,
+// OR the outputs of that operation (both work equivalently).
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
+// Returns `gradients * (features > 0) + alpha * gradients * (features <= 0)`.
+func LeakyReluGrad(scope *Scope, gradients tf.Output, features tf.Output, optional ...LeakyReluGradAttr) (backprops tf.Output) {
	if scope.Err() != nil {
		return
	}
@@ -9151,58 +9622,1463 @@ func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr)
		a(attrs)
	}
	opspec := tf.OpSpec{
-		Type: "TopKV2",
+		Type: "LeakyReluGrad",
		Input: []tf.Input{
-			input, k,
+			gradients, features,
		},
		Attrs: attrs,
	}
	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
}

-// Deprecated. Use TensorArrayGradV3
+// Computes the gradient of morphological 2-D dilation with respect to the filter.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
-func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Arguments:
+// input: 4-D with shape `[batch, in_height, in_width, depth]`.
+// filter: 3-D with shape `[filter_height, filter_width, depth]`.
+// out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+// strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+// rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+// padding: The type of padding algorithm to use.
+//
+// Returns 3-D with shape `[filter_height, filter_width, depth]`.
+func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Dilation2DBackpropFilter",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodePaddedRawAttr is an optional argument to DecodePaddedRaw.
+type DecodePaddedRawAttr func(optionalAttr)
+
+// DecodePaddedRawLittleEndian sets the optional little_endian attribute to value.
+//
+// value: Whether the input `input_bytes` is in little-endian order. Ignored for
+// `out_type` values that are stored in a single byte, like `uint8`
+// If not specified, defaults to true
+func DecodePaddedRawLittleEndian(value bool) DecodePaddedRawAttr {
+	return func(m optionalAttr) {
+		m["little_endian"] = value
+	}
+}
+
+// Reinterpret the bytes of a string as a vector of numbers.
+//
+// Arguments:
+// input_bytes: Tensor of string to be decoded.
+// fixed_length: Length in bytes for each element of the decoded output. Must be a multiple
+// of the size of the output type.
+//
+//
+// Returns A Tensor with one more dimension than the input `bytes`. The added dimension
+// will have size equal to the length of the elements of `bytes` divided by the
+// number of bytes to represent `out_type`.
+func DecodePaddedRaw(scope *Scope, input_bytes tf.Output, fixed_length tf.Output, out_type tf.DataType, optional ...DecodePaddedRawAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"out_type": out_type} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DecodePaddedRaw", + Input: []tf.Input{ + input_bytes, fixed_length, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Restores tensors from a V2 checkpoint. +// +// For backward compatibility with the V1 format, this Op currently allows +// restoring from a V1 checkpoint as well: +// - This Op first attempts to find the V2 index file pointed to by "prefix", and +// if found proceed to read it as a V2 checkpoint; +// - Otherwise the V1 read path is invoked. +// Relying on this behavior is not recommended, as the ability to fall back to read +// V1 might be deprecated and eventually removed. +// +// By default, restores the named tensors in full. If the caller wishes to restore +// specific slices of stored tensors, "shape_and_slices" should be non-empty +// strings and correspondingly well-formed. +// +// Callers must ensure all the named tensors are indeed stored in the checkpoint. +// +// Arguments: +// prefix: Must have a single element. The prefix of a V2 checkpoint. +// tensor_names: shape {N}. The names of the tensors to be restored. +// shape_and_slices: shape {N}. The slice specs of the tensors to be restored. +// Empty strings indicate that they are non-partitioned tensors. +// dtypes: shape {N}. The list of expected dtype for the tensors. Must match +// those stored in the checkpoint. +// +// Returns shape {N}. The restored tensors, whose shapes are read from the +// checkpoint directly. +func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtypes": dtypes} + opspec := tf.OpSpec{ + Type: "RestoreV2", + Input: []tf.Input{ + prefix, tensor_names, shape_and_slices, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil { + scope.UpdateErr("RestoreV2", err) + return + } + return tensors +} + +// MaxPoolGradWithArgmaxAttr is an optional argument to MaxPoolGradWithArgmax. +type MaxPoolGradWithArgmaxAttr func(optionalAttr) + +// MaxPoolGradWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value. +// +// value: Whether to include batch dimension in flattened index of `argmax`. +// If not specified, defaults to false +func MaxPoolGradWithArgmaxIncludeBatchInIndex(value bool) MaxPoolGradWithArgmaxAttr { + return func(m optionalAttr) { + m["include_batch_in_index"] = value + } +} + +// Computes gradients of the maxpooling function. +// +// Arguments: +// input: The original input. +// grad: 4-D with shape `[batch, height, width, channels]`. Gradients w.r.t. the +// output of `max_pool`. +// argmax: The indices of the maximum values chosen for each output of `max_pool`. +// ksize: The size of the window for each dimension of the input tensor. +// strides: The stride of the sliding window for each dimension of the +// input tensor. +// padding: The type of padding algorithm to use. +// +// Returns Gradients w.r.t. the input of `max_pool`. 
+func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradWithArgmaxAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MaxPoolGradWithArgmax", + Input: []tf.Input{ + input, grad, argmax, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// PrelinearizeAttr is an optional argument to Prelinearize. +type PrelinearizeAttr func(optionalAttr) + +// PrelinearizeShape sets the optional shape attribute to value. +// +// value: The shape of the tensor. +// If not specified, defaults to <> +func PrelinearizeShape(value tf.Shape) PrelinearizeAttr { + return func(m optionalAttr) { + m["shape"] = value + } +} + +// PrelinearizeLayout sets the optional layout attribute to value. +// +// value: A vector holding the requested layout in minor-to-major sequence. If a layout +// attribute is passed but its values are all -1 the layout will be computed by +// the infeed operation. +// If not specified, defaults to <> +func PrelinearizeLayout(value []int64) PrelinearizeAttr { + return func(m optionalAttr) { + m["layout"] = value + } +} + +// An op which linearizes one Tensor value to an opaque variant tensor. +// +// Arguments: +// input: A tensor that will be linearized. +func Prelinearize(scope *Scope, input tf.Output, optional ...PrelinearizeAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Prelinearize", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MaxPoolV2Attr is an optional argument to MaxPoolV2. +type MaxPoolV2Attr func(optionalAttr) + +// MaxPoolV2DataFormat sets the optional data_format attribute to value. +// +// value: Specify the data format of the input and output data. With the +// default format "NHWC", the data is stored in the order of: +// [batch, in_height, in_width, in_channels]. +// Alternatively, the format could be "NCHW", the data storage order of: +// [batch, in_channels, in_height, in_width]. +// If not specified, defaults to "NHWC" +func MaxPoolV2DataFormat(value string) MaxPoolV2Attr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Performs max pooling on the input. +// +// Arguments: +// input: 4-D input to pool over. +// ksize: The size of the window for each dimension of the input tensor. +// strides: The stride of the sliding window for each dimension of the +// input tensor. +// padding: The type of padding algorithm to use. +// +// Returns The max pooled output tensor. +func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MaxPoolV2", + Input: []tf.Input{ + input, ksize, strides, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// LRNGradAttr is an optional argument to LRNGrad. +type LRNGradAttr func(optionalAttr) + +// LRNGradDepthRadius sets the optional depth_radius attribute to value. +// +// value: A depth radius. 
+// If not specified, defaults to 5 +func LRNGradDepthRadius(value int64) LRNGradAttr { + return func(m optionalAttr) { + m["depth_radius"] = value + } +} + +// LRNGradBias sets the optional bias attribute to value. +// +// value: An offset (usually > 0 to avoid dividing by 0). +// If not specified, defaults to 1 +func LRNGradBias(value float32) LRNGradAttr { + return func(m optionalAttr) { + m["bias"] = value + } +} + +// LRNGradAlpha sets the optional alpha attribute to value. +// +// value: A scale factor, usually positive. +// If not specified, defaults to 1 +func LRNGradAlpha(value float32) LRNGradAttr { + return func(m optionalAttr) { + m["alpha"] = value + } +} + +// LRNGradBeta sets the optional beta attribute to value. +// +// value: An exponent. +// If not specified, defaults to 0.5 +func LRNGradBeta(value float32) LRNGradAttr { + return func(m optionalAttr) { + m["beta"] = value + } +} + +// Gradients for Local Response Normalization. +// +// Arguments: +// input_grads: 4-D with shape `[batch, height, width, channels]`. +// input_image: 4-D with shape `[batch, height, width, channels]`. +// output_image: 4-D with shape `[batch, height, width, channels]`. +// +// Returns The gradients for LRN. +func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LRNGrad", + Input: []tf.Input{ + input_grads, input_image, output_image, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// LRNAttr is an optional argument to LRN. +type LRNAttr func(optionalAttr) + +// LRNDepthRadius sets the optional depth_radius attribute to value. +// +// value: 0-D. Half-width of the 1-D normalization window. +// If not specified, defaults to 5 +func LRNDepthRadius(value int64) LRNAttr { + return func(m optionalAttr) { + m["depth_radius"] = value + } +} + +// LRNBias sets the optional bias attribute to value. +// +// value: An offset (usually positive to avoid dividing by 0). +// If not specified, defaults to 1 +func LRNBias(value float32) LRNAttr { + return func(m optionalAttr) { + m["bias"] = value + } +} + +// LRNAlpha sets the optional alpha attribute to value. +// +// value: A scale factor, usually positive. +// If not specified, defaults to 1 +func LRNAlpha(value float32) LRNAttr { + return func(m optionalAttr) { + m["alpha"] = value + } +} + +// LRNBeta sets the optional beta attribute to value. +// +// value: An exponent. +// If not specified, defaults to 0.5 +func LRNBeta(value float32) LRNAttr { + return func(m optionalAttr) { + m["beta"] = value + } +} + +// Local Response Normalization. +// +// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last +// dimension), and each vector is normalized independently. Within a given vector, +// each component is divided by the weighted, squared sum of inputs within +// `depth_radius`. In detail, +// +// sqr_sum[a, b, c, d] = +// sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2) +// output = input / (bias + alpha * sqr_sum) ** beta +// +// For details, see [Krizhevsky et al., ImageNet classification with deep +// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks). +// +// Arguments: +// input: 4-D. 
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LRN",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
+
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+// [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+// [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes second-order gradients of the maxpooling function.
+//
+// Arguments:
+// orig_input: The original input tensor.
+// orig_output: The original output tensor.
+// grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+// ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+// strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+// padding: The type of padding algorithm to use.
+//
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPool3DGradGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
+//
+// This operation folds the padded areas of `input` by `MirrorPad` according to the
+// `paddings` you specify. `paddings` must be the same as `paddings` argument
+// given to the corresponding `MirrorPad` op.
+//
+// The folded size of each dimension D of the output is:
+//
+// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
+// # 'paddings' is [[0, 1], [0, 1]].
+// # 'mode' is SYMMETRIC.
+// # rank of 't' is 2.
+// pad(t, paddings) ==> [[ 1, 5]
+// [11, 28]]
+// ```
+//
+// Arguments:
+// input: The input tensor to be folded.
+// paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+// mode: The mode used in the `MirrorPad` op.
+//
+// Returns The folded tensor.
+func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"mode": mode} + opspec := tf.OpSpec{ + Type: "MirrorPadGrad", + Input: []tf.Input{ + input, paddings, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MaxPool3DAttr is an optional argument to MaxPool3D. +type MaxPool3DAttr func(optionalAttr) + +// MaxPool3DDataFormat sets the optional data_format attribute to value. +// +// value: The data format of the input and output data. With the +// default format "NDHWC", the data is stored in the order of: +// [batch, in_depth, in_height, in_width, in_channels]. +// Alternatively, the format could be "NCDHW", the data storage order is: +// [batch, in_channels, in_depth, in_height, in_width]. +// If not specified, defaults to "NDHWC" +func MaxPool3DDataFormat(value string) MaxPool3DAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Performs 3D max pooling on the input. +// +// Arguments: +// input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over. +// ksize: 1-D tensor of length 5. The size of the window for each dimension of +// the input tensor. Must have `ksize[0] = ksize[4] = 1`. +// strides: 1-D tensor of length 5. The stride of the sliding window for each +// dimension of `input`. Must have `strides[0] = strides[4] = 1`. +// padding: The type of padding algorithm to use. +// +// Returns The max pooled output tensor. +func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MaxPool3D", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// PrintAttr is an optional argument to Print. +type PrintAttr func(optionalAttr) + +// PrintMessage sets the optional message attribute to value. +// +// value: A string, prefix of the error message. +// If not specified, defaults to "" +func PrintMessage(value string) PrintAttr { + return func(m optionalAttr) { + m["message"] = value + } +} + +// PrintFirstN sets the optional first_n attribute to value. +// +// value: Only log `first_n` number of times. -1 disables logging. +// If not specified, defaults to -1 +func PrintFirstN(value int64) PrintAttr { + return func(m optionalAttr) { + m["first_n"] = value + } +} + +// PrintSummarize sets the optional summarize attribute to value. +// +// value: Only print this many entries of each tensor. +// If not specified, defaults to 3 +func PrintSummarize(value int64) PrintAttr { + return func(m optionalAttr) { + m["summarize"] = value + } +} + +// Prints a list of tensors. +// +// Passes `input` through to `output` and prints `data` when evaluating. +// +// Arguments: +// input: The tensor passed to `output` +// data: A list of tensors to print out when op is evaluated. 
+// +// Returns = The unmodified `input` tensor +func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Print", + Input: []tf.Input{ + input, tf.OutputList(data), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Saves the input tensors to disk. +// +// The size of `tensor_names` must match the number of tensors in `data`. `data[i]` +// is written to `filename` with name `tensor_names[i]`. +// +// See also `SaveSlices`. +// +// Arguments: +// filename: Must have a single element. The name of the file to which we write +// the tensor. +// tensor_names: Shape `[N]`. The names of the tensors to be saved. +// data: `N` tensors to save. +// +// Returns the created operation. +func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "TensorArrayWriteV2", + Type: "Save", Input: []tf.Input{ - handle, index, value, flow_in, + filename, tensor_names, tf.OutputList(data), + }, + } + return scope.AddOperation(opspec) +} + +// A dataset that splits the elements of its input into multiple elements. +func ExperimentalUnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "ExperimentalUnbatchDataset", + Input: []tf.Input{ + input_dataset, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes log softmax activations. +// +// For each batch `i` and class `j` we have +// +// logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i]))) +// +// Arguments: +// logits: 2-D with shape `[batch_size, num_classes]`. +// +// Returns Same shape as `logits`. +func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "LogSoftmax", + Input: []tf.Input{ + logits, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// This op is used as a placeholder in If branch functions. It doesn't provide a -// valid output when run, so must either be removed (e.g. replaced with a -// function input) or guaranteed not to be used (e.g. if mirroring an -// intermediate output needed for the gradient computation of the other branch). +// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput. +type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr) + +// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value. +// +// value: Specify the data format of the input and output data. With the +// default format "NHWC", the data is stored in the order of: +// [batch, height, width, channels]. +// Alternatively, the format could be "NCHW", the data storage order of: +// [batch, channels, height, width]. +// If not specified, defaults to "NHWC" +func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value. 
+// +// value: 1-D tensor of length 4. The dilation factor for each dimension of +// `input`. If set to k > 1, there will be k-1 skipped cells between each filter +// element on that dimension. The dimension order is determined by the value of +// `data_format`, see above for details. Dilations in the batch and depth +// dimensions must be 1. +// If not specified, defaults to <i:1 i:1 i:1 i:1 > +func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes the gradients of depthwise convolution with respect to the input. // // Arguments: -// dtype: The type of the output. -// shape: The purported shape of the output. This is only used for shape inference; -// the output will not necessarily have this shape. Can be a partial shape. +// input_sizes: An integer vector representing the shape of `input`, based +// on `data_format`. For example, if `data_format` is 'NHWC' then +// `input` is a 4-D `[batch, height, width, channels]` tensor. +// filter: 4-D with shape +// `[filter_height, filter_width, in_channels, depthwise_multiplier]`. +// out_backprop: 4-D with shape based on `data_format`. +// For example, if `data_format` is 'NHWC' then +// out_backprop shape is `[batch, out_height, out_width, out_channels]`. +// Gradients w.r.t. the output of the convolution. +// strides: The stride of the sliding window for each dimension of the input +// of the convolution. +// padding: The type of padding algorithm to use. // -// Returns \"Fake\" output value. This should not be consumed by another op. -func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) { +// Returns 4-D with shape according to `data_format`. For example, if +// `data_format` is 'NHWC', output shape is `[batch, in_height, +// in_width, in_channels]`. Gradient w.r.t. the input of the +// convolution. +func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"dtype": dtype, "shape": shape} + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } opspec := tf.OpSpec{ - Type: "FakeParam", - + Type: "DepthwiseConv2dNativeBackpropInput", + Input: []tf.Input{ + input_sizes, filter, out_backprop, + }, Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) } +// ArgMinAttr is an optional argument to ArgMin. +type ArgMinAttr func(optionalAttr) + +// ArgMinOutputType sets the optional output_type attribute to value. +// If not specified, defaults to DT_INT64 +func ArgMinOutputType(value tf.DataType) ArgMinAttr { + return func(m optionalAttr) { + m["output_type"] = value + } +} + +// Returns the index with the smallest value across dimensions of a tensor. +// +// Note that in case of ties the identity of the return value is not guaranteed. +// +// Usage: +// ```python +// import tensorflow as tf +// a = [1, 10, 26.9, 2.8, 166.32, 62.3] +// b = tf.math.argmin(input = a) +// c = tf.keras.backend.eval(b) +// # c = 0 +// # here a[0] = 1 which is the smallest element of a across axis 0 +// ``` +// +// Arguments: +// +// dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`. +// Describes which dimension of the input Tensor to reduce across. 
For vectors, +// use dimension = 0. +func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ArgMin", + Input: []tf.Input{ + input, dimension, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D. +type FusedResizeAndPadConv2DAttr func(optionalAttr) + +// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value. +// +// value: If true, the centers of the 4 corner pixels of the input and output tensors are +// aligned, preserving the values at the corner pixels. Defaults to false. +// If not specified, defaults to false +func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr { + return func(m optionalAttr) { + m["resize_align_corners"] = value + } +} + +// Performs a resize and padding as a preprocess during a convolution. +// +// It's often possible to do spatial transformations more efficiently as part of +// the packing stage of a convolution, so this op allows for an optimized +// implementation where these stages are fused together. This prevents the need to +// write out the intermediate results as whole tensors, reducing memory pressure, +// and we can get some latency gains by merging the transformation calculations. +// The data_format attribute for Conv2D isn't supported by this op, and defaults to +// 'NHWC' order. +// Internally this op uses a single per-graph scratch buffer, which means that it +// will block if multiple versions are being run in parallel. This is because this +// operator is primarily an optimization to minimize memory usage. +// +// Arguments: +// input: 4-D with shape `[batch, in_height, in_width, in_channels]`. +// size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The +// new size for the images. +// paddings: A two-column matrix specifying the padding sizes. The number of +// rows must be the same as the rank of `input`. +// filter: 4-D with shape +// `[filter_height, filter_width, in_channels, out_channels]`. +// +// strides: 1-D of length 4. The stride of the sliding window for each dimension +// of `input`. Must be in the same order as the dimension specified with format. +// padding: The type of padding algorithm to use. +func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "FusedResizeAndPadConv2D", + Input: []tf.Input{ + input, size, paddings, filter, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MaxPoolGradAttr is an optional argument to MaxPoolGrad. +type MaxPoolGradAttr func(optionalAttr) + +// MaxPoolGradDataFormat sets the optional data_format attribute to value. +// +// value: Specify the data format of the input and output data. With the +// default format "NHWC", the data is stored in the order of: +// [batch, in_height, in_width, in_channels]. 
+// Alternatively, the format could be "NCHW", the data storage order of: +// [batch, in_channels, in_height, in_width]. +// If not specified, defaults to "NHWC" +func MaxPoolGradDataFormat(value string) MaxPoolGradAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Computes gradients of the maxpooling function. +// +// Arguments: +// orig_input: The original input tensor. +// orig_output: The original output tensor. +// grad: 4-D. Gradients w.r.t. the output of `max_pool`. +// ksize: The size of the window for each dimension of the input tensor. +// strides: The stride of the sliding window for each dimension of the +// input tensor. +// padding: The type of padding algorithm to use. +// +// Returns Gradients w.r.t. the input to `max_pool`. +func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MaxPoolGrad", + Input: []tf.Input{ + orig_input, orig_output, grad, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes hyperbolic sine of x element-wise. +func Sinh(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Sinh", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// L2 Loss. +// +// Computes half the L2 norm of a tensor without the `sqrt`: +// +// output = sum(t ** 2) / 2 +// +// Arguments: +// t: Typically 2-D, but may have any dimensions. +// +// Returns 0-D. +func L2Loss(scope *Scope, t tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "L2Loss", + Input: []tf.Input{ + t, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Decodes a `variant` Tensor into a `RaggedTensor`. +// +// Decodes the given `variant` Tensor and returns a `RaggedTensor`. The input +// could be a scalar, meaning it encodes a single `RaggedTensor` with ragged_rank +// `output_ragged_rank`. It could also have an arbitrary rank, in which case each +// element is decoded into a `RaggedTensor` with ragged_rank `input_ragged_rank` +// and these are then stacked according to the input shape to output a single +// `RaggedTensor` with ragged_rank `output_ragged_rank`. Each `variant` element in +// the input Tensor is decoded by retrieving from the element a 1-D `variant` +// Tensor with `input_ragged_rank + 1` Tensors, corresponding to the splits and +// values of the decoded `RaggedTensor`. If `input_ragged_rank` is -1, then it is +// inferred as `output_ragged_rank` - `rank(encoded_ragged)`. See +// `RaggedTensorToVariant` for the corresponding encoding logic. +// +// +// Arguments: +// encoded_ragged: A `variant` Tensor containing encoded `RaggedTensor`s. +// input_ragged_rank: The ragged rank of each encoded `RaggedTensor` component in the input. If set to +// -1, this is inferred as `output_ragged_rank` - `rank(encoded_ragged)` +// output_ragged_rank: The expected ragged rank of the output `RaggedTensor`. The following must hold: +// `output_ragged_rank = rank(encoded_ragged) + input_ragged_rank`. 
+// +// +// +// Returns A list of one or more Tensors representing the splits of the output +// `RaggedTensor`.A Tensor representing the values of the output `RaggedTensor`. +func RaggedTensorFromVariant(scope *Scope, encoded_ragged tf.Output, input_ragged_rank int64, output_ragged_rank int64, Tvalues tf.DataType, Tsplits tf.DataType) (output_nested_splits []tf.Output, output_dense_values tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"input_ragged_rank": input_ragged_rank, "output_ragged_rank": output_ragged_rank, "Tvalues": Tvalues, "Tsplits": Tsplits} + opspec := tf.OpSpec{ + Type: "RaggedTensorFromVariant", + Input: []tf.Input{ + encoded_ragged, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil { + scope.UpdateErr("RaggedTensorFromVariant", err) + return + } + output_dense_values = op.Output(idx) + return output_nested_splits, output_dense_values +} + +// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput. +type Conv2DBackpropInputAttr func(optionalAttr) + +// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value. +// If not specified, defaults to true +func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr { + return func(m optionalAttr) { + m["use_cudnn_on_gpu"] = value + } +} + +// Conv2DBackpropInputExplicitPaddings sets the optional explicit_paddings attribute to value. +// +// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith +// dimension, the amount of padding inserted before and after the dimension is +// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If +// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty. +// If not specified, defaults to <> +func Conv2DBackpropInputExplicitPaddings(value []int64) Conv2DBackpropInputAttr { + return func(m optionalAttr) { + m["explicit_paddings"] = value + } +} + +// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value. +// +// value: Specify the data format of the input and output data. With the +// default format "NHWC", the data is stored in the order of: +// [batch, in_height, in_width, in_channels]. +// Alternatively, the format could be "NCHW", the data storage order of: +// [batch, in_channels, in_height, in_width]. +// If not specified, defaults to "NHWC" +func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Conv2DBackpropInputDilations sets the optional dilations attribute to value. +// +// value: 1-D tensor of length 4. The dilation factor for each dimension of +// `input`. If set to k > 1, there will be k-1 skipped cells between each filter +// element on that dimension. The dimension order is determined by the value of +// `data_format`, see above for details. Dilations in the batch and depth +// dimensions must be 1. +// If not specified, defaults to <i:1 i:1 i:1 i:1 > +func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes the gradients of convolution with respect to the input. +// +// Arguments: +// input_sizes: An integer vector representing the shape of `input`, +// where `input` is a 4-D `[batch, height, width, channels]` tensor. 
+// filter: 4-D with shape +// `[filter_height, filter_width, in_channels, out_channels]`. +// out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`. +// Gradients w.r.t. the output of the convolution. +// strides: The stride of the sliding window for each dimension of the input +// of the convolution. Must be in the same order as the dimension specified with +// format. +// padding: The type of padding algorithm to use. +// +// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`. Gradient +// w.r.t. the input of the convolution. +func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Conv2DBackpropInput", + Input: []tf.Input{ + input_sizes, filter, out_backprop, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ApproximateEqualAttr is an optional argument to ApproximateEqual. +type ApproximateEqualAttr func(optionalAttr) + +// ApproximateEqualTolerance sets the optional tolerance attribute to value. +// If not specified, defaults to 1e-05 +func ApproximateEqualTolerance(value float32) ApproximateEqualAttr { + return func(m optionalAttr) { + m["tolerance"] = value + } +} + +// Returns the truth value of abs(x-y) < tolerance element-wise. +func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ApproximateEqual", + Input: []tf.Input{ + x, y, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Shuts down a running distributed TPU system. +// +// The op returns an error if no system is running. +// +// Returns the created operation. +func ShutdownDistributedTPU(scope *Scope) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ShutdownDistributedTPU", + } + return scope.AddOperation(opspec) +} + +// InfeedEnqueuePrelinearizedBufferAttr is an optional argument to InfeedEnqueuePrelinearizedBuffer. +type InfeedEnqueuePrelinearizedBufferAttr func(optionalAttr) + +// InfeedEnqueuePrelinearizedBufferDeviceOrdinal sets the optional device_ordinal attribute to value. +// +// value: The TPU device to use. This should be -1 when the Op is running on a TPU device +// and = 0 when the Op is running on the CPU device. +// If not specified, defaults to -1 +func InfeedEnqueuePrelinearizedBufferDeviceOrdinal(value int64) InfeedEnqueuePrelinearizedBufferAttr { + return func(m optionalAttr) { + m["device_ordinal"] = value + } +} + +// An op which enqueues prelinearized buffer into TPU infeed. +// +// Arguments: +// input: A variant tensor representing linearized output. +// +// Returns the created operation. 
+func InfeedEnqueuePrelinearizedBuffer(scope *Scope, input tf.Output, optional ...InfeedEnqueuePrelinearizedBufferAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "InfeedEnqueuePrelinearizedBuffer", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Creates a dataset that skips `count` elements from the `input_dataset`. +// +// Arguments: +// +// count: A scalar representing the number of elements from the `input_dataset` +// that should be skipped. If count is -1, skips everything. +// +// +func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "SkipDataset", + Input: []tf.Input{ + input_dataset, count, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Transforms a Tensor into a serialized TensorProto proto. +// +// Arguments: +// tensor: A Tensor of type `T`. +// +// Returns A serialized TensorProto proto of the input tensor. +func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SerializeTensor", + Input: []tf.Input{ + tensor, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// TensorArrayV2Attr is an optional argument to TensorArrayV2. +type TensorArrayV2Attr func(optionalAttr) + +// TensorArrayV2ElementShape sets the optional element_shape attribute to value. +// If not specified, defaults to <unknown_rank:true > +func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr { + return func(m optionalAttr) { + m["element_shape"] = value + } +} + +// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value. +// If not specified, defaults to false +func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr { + return func(m optionalAttr) { + m["dynamic_size"] = value + } +} + +// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value. +// If not specified, defaults to true +func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr { + return func(m optionalAttr) { + m["clear_after_read"] = value + } +} + +// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value. +// If not specified, defaults to "" +func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr { + return func(m optionalAttr) { + m["tensor_array_name"] = value + } +} + +// Deprecated. Use TensorArrayV3 +// +// DEPRECATED at GraphDef version 26: Use TensorArrayV3 +func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TensorArrayV2", + Input: []tf.Input{ + size, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Provides the time since epoch in seconds. +// +// Returns the timestamp as a `float64` for seconds since the Unix epoch. +// +// Note: the timestamp is computed when the op is executed, not when it is added +// to the graph. 
+func Timestamp(scope *Scope) (ts tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Timestamp", + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Transforms a serialized tensorflow.TensorProto proto into a Tensor. +// +// Arguments: +// serialized: A scalar string containing a serialized TensorProto proto. +// out_type: The type of the serialized tensor. The provided type must match the +// type of the serialized tensor and no implicit conversion will take place. +// +// Returns A Tensor of type `out_type`. +func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"out_type": out_type} + opspec := tf.OpSpec{ + Type: "ParseTensor", + Input: []tf.Input{ + serialized, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes rectified linear 6 gradients for a Relu6 operation. +// +// Arguments: +// gradients: The backpropagated gradients to the corresponding Relu6 operation. +// features: The features passed as input to the corresponding Relu6 operation, or +// its output; using either one produces the same result. +// +// Returns The gradients: +// `gradients * (features > 0) * (features < 6)`. +func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Relu6Grad", + Input: []tf.Input{ + gradients, features, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// DecodeCompressedAttr is an optional argument to DecodeCompressed. +type DecodeCompressedAttr func(optionalAttr) + +// DecodeCompressedCompressionType sets the optional compression_type attribute to value. +// +// value: A scalar containing either (i) the empty string (no +// compression), (ii) "ZLIB", or (iii) "GZIP". +// If not specified, defaults to "" +func DecodeCompressedCompressionType(value string) DecodeCompressedAttr { + return func(m optionalAttr) { + m["compression_type"] = value + } +} + +// Decompress strings. +// +// This op decompresses each element of the `bytes` input `Tensor`, which +// is assumed to be compressed using the given `compression_type`. +// +// The `output` is a string `Tensor` of the same shape as `bytes`, +// each element containing the decompressed data from the corresponding +// element in `bytes`. +// +// Arguments: +// bytes: A Tensor of string which is compressed. +// +// Returns A Tensor with the same shape as input `bytes`, uncompressed +// from bytes. +func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DecodeCompressed", + Input: []tf.Input{ + bytes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent. +type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr) + +// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value. +// +// value: If True, the subtraction will be protected by a lock; +// otherwise the behavior is undefined, but may exhibit less contention. 
+// If not specified, defaults to false +func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Sparse update '*var' as FOBOS algorithm with fixed learning rate. +// +// That is for rows we have grad for, we update var as follows: +// prox_v = var - alpha * grad +// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} +// +// Arguments: +// var_: Should be from a Variable(). +// alpha: Scaling factor. Must be a scalar. +// l1: L1 regularization. Must be a scalar. +// l2: L2 regularization. Must be a scalar. +// grad: The gradient. +// indices: A vector of indices into the first dimension of var and accum. +// +// Returns the created operation. +func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceSparseApplyProximalGradientDescent", + Input: []tf.Input{ + var_, alpha, l1, l2, grad, indices, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Returns the gradient of `Tile`. +// +// DEPRECATED at GraphDef version 3: TileGrad has been replaced with reduce_sum +// +// Since `Tile` takes an input and repeats the input `multiples` times +// along each dimension, `TileGrad` takes in `multiples` and aggregates +// each repeated tile of `input` into `output`. +func TileGrad(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TileGrad", + Input: []tf.Input{ + input, multiples, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample. type ParseSingleSequenceExampleAttr func(optionalAttr) @@ -9367,27 +11243,122 @@ func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values } -// Creates a dataset that asynchronously prefetches elements from `input_dataset`. +// Flips all bits elementwise. // -// Arguments: -// -// buffer_size: The maximum number of elements to buffer in an iterator over -// this dataset. -// -// -func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { +// The result will have exactly those bits set, that are not set in `x`. The +// computation is performed on the underlying representation of x. +func Invert(scope *Scope, x tf.Output) (y tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} opspec := tf.OpSpec{ - Type: "PrefetchDataset", + Type: "Invert", Input: []tf.Input{ - input_dataset, buffer_size, + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Generate the bucket boundaries for each feature based on accumulated summaries. +// +// An op that returns a list of float tensors for a quantile stream resource. 
Each +// tensor is Rank 1 containing bucket boundaries for a single feature. +// +// Arguments: +// quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource. +// num_features: inferred int; number of features to get bucket boundaries for. +// +// Returns float; List of Rank 1 Tensors each containing the bucket boundaries for a feature. +func BoostedTreesQuantileStreamResourceGetBucketBoundaries(scope *Scope, quantile_stream_resource_handle tf.Output, num_features int64) (bucket_boundaries []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_features": num_features} + opspec := tf.OpSpec{ + Type: "BoostedTreesQuantileStreamResourceGetBucketBoundaries", + Input: []tf.Input{ + quantile_stream_resource_handle, }, Attrs: attrs, } op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if bucket_boundaries, idx, err = makeOutputList(op, idx, "bucket_boundaries"); err != nil { + scope.UpdateErr("BoostedTreesQuantileStreamResourceGetBucketBoundaries", err) + return + } + return bucket_boundaries +} + +// Encodes a `RaggedTensor` into a `variant` Tensor. +// +// +// Encodes the given `RaggedTensor` and returns a `variant` Tensor. If +// `batched_input` is True, then input `RaggedTensor` is unbatched along the +// zero-th dimension, each component `RaggedTensor` is encoded into a scalar +// `variant` Tensor, and these are stacked to return a 1-D `variant` Tensor. +// If `batched_input` is False, then the input `RaggedTensor` is encoded as is and +// a scalar `variant` Tensor is returned. A `RaggedTensor` is encoded by first +// creating a 1-D `variant` Tensor with `ragged_rank + 1` elements, containing the +// splits and values Tensors of the `RaggedTensor`. Then the 1-D `variant` Tensor +// is wrapped in a scalar `variant` Tensor. See `RaggedTensorFromVariant` for the +// corresponding decoding logic. +// +// +// Arguments: +// rt_nested_splits: A list of one or more Tensors representing the splits of the input +// `RaggedTensor`. +// rt_dense_values: A Tensor representing the values of the input `RaggedTensor`. +// batched_input: A `bool` denoting whether the input is a batched `RaggedTensor`. +// +// Returns A `variant` Tensor that containing encoded `RaggedTensor`. +func RaggedTensorToVariant(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output, batched_input bool) (encoded_ragged tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"batched_input": batched_input} + opspec := tf.OpSpec{ + Type: "RaggedTensorToVariant", + Input: []tf.Input{ + tf.OutputList(rt_nested_splits), rt_dense_values, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Fast Fourier transform. +// +// Computes the 1-dimensional discrete Fourier transform over the inner-most +// dimension of `input`. +// +// Arguments: +// input: A complex tensor. +// +// Returns A complex tensor of the same shape as `input`. The inner-most +// dimension of `input` is replaced with its 1D Fourier transform. 
+// +// @compatibility(numpy) +// Equivalent to np.fft.fft +// @end_compatibility +func FFT(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "FFT", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) return op.Output(0) } @@ -9416,19 +11387,148 @@ func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_v return op.Output(0), op.Output(1), op.Output(2) } -// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is +// RandomPoissonV2Attr is an optional argument to RandomPoissonV2. +type RandomPoissonV2Attr func(optionalAttr) + +// RandomPoissonV2Seed sets the optional seed attribute to value. // -// true, this follows Python semantics in that the result here is consistent -// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`. +// value: If either `seed` or `seed2` are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// RandomPoissonV2Seed2 sets the optional seed2 attribute to value. // -// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting +// value: A second seed to avoid seed collision. +// If not specified, defaults to 0 +func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// RandomPoissonV2Dtype sets the optional dtype attribute to value. +// If not specified, defaults to DT_INT64 +func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr { + return func(m optionalAttr) { + m["dtype"] = value + } +} + +// Outputs random values from the Poisson distribution(s) described by rate. +// +// This op uses two algorithms, depending on rate. If rate >= 10, then +// the algorithm by Hormann is used to acquire samples via +// transformation-rejection. +// See http://www.sciencedirect.com/science/article/pii/0167668793909974. +// +// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform +// random variables. +// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer +// Programming, Volume 2. Addison Wesley +// +// Arguments: +// shape: 1-D integer tensor. Shape of independent samples to draw from each +// distribution described by the shape parameters given in rate. +// rate: A tensor in which each scalar is a "rate" parameter describing the +// associated poisson distribution. +// +// Returns A tensor with shape `shape + shape(rate)`. Each slice +// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for +// `rate[i0, i1, ...iN]`. +func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RandomPoissonV2", + Input: []tf.Input{ + shape, rate, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RandomGammaAttr is an optional argument to RandomGamma. +type RandomGammaAttr func(optionalAttr) + +// RandomGammaSeed sets the optional seed attribute to value. +// +// value: If either `seed` or `seed2` are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. 
+// If not specified, defaults to 0 +func RandomGammaSeed(value int64) RandomGammaAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// RandomGammaSeed2 sets the optional seed2 attribute to value. +// +// value: A second seed to avoid seed collision. +// If not specified, defaults to 0 +func RandomGammaSeed2(value int64) RandomGammaAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Outputs random values from the Gamma distribution(s) described by alpha. +// +// This op uses the algorithm by Marsaglia et al. to acquire samples via +// transformation-rejection from pairs of uniform and normal random variables. +// See http://dl.acm.org/citation.cfm?id=358414 +// +// Arguments: +// shape: 1-D integer tensor. Shape of independent samples to draw from each +// distribution described by the shape parameters given in alpha. +// alpha: A tensor in which each scalar is a "shape" parameter describing the +// associated gamma distribution. +// +// Returns A tensor with shape `shape + shape(alpha)`. Each slice +// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for +// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha. +func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RandomGamma", + Input: []tf.Input{ + shape, alpha, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the truth value of (x < y) element-wise. +// +// *NOTE*: `Less` supports broadcasting. More about broadcasting // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { +func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "FloorMod", + Type: "Less", Input: []tf.Input{ x, y, }, @@ -9437,64 +11537,80 @@ func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { return op.Output(0) } -// Computes softsign: `features / (abs(features) + 1)`. -func Softsign(scope *Scope, features tf.Output) (activations tf.Output) { +// Adjust the hue of one or more images. +// +// `images` is a tensor of at least 3 dimensions. The last dimension is +// interpretted as channels, and must be three. +// +// The input image is considered in the RGB colorspace. Conceptually, the RGB +// colors are first mapped into HSV. A delta is then applied all the hue values, +// and then remapped back to RGB colorspace. +// +// Arguments: +// images: Images to adjust. At least 3-D. +// delta: A float delta to add to the hue. +// +// Returns The hue-adjusted image or images. +func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "Softsign", + Type: "AdjustHue", Input: []tf.Input{ - features, + images, delta, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad. -type ResizeNearestNeighborGradAttr func(optionalAttr) +// RandomStandardNormalAttr is an optional argument to RandomStandardNormal. +type RandomStandardNormalAttr func(optionalAttr) -// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value. 
+// RandomStandardNormalSeed sets the optional seed attribute to value. // -// value: If true, the centers of the 4 corner pixels of the input and grad tensors are -// aligned. Defaults to false. -// If not specified, defaults to false -func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr { +// value: If either `seed` or `seed2` are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr { return func(m optionalAttr) { - m["align_corners"] = value + m["seed"] = value } } -// ResizeNearestNeighborGradHalfPixelCenters sets the optional half_pixel_centers attribute to value. -// If not specified, defaults to false -func ResizeNearestNeighborGradHalfPixelCenters(value bool) ResizeNearestNeighborGradAttr { +// RandomStandardNormalSeed2 sets the optional seed2 attribute to value. +// +// value: A second seed to avoid seed collision. +// If not specified, defaults to 0 +func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr { return func(m optionalAttr) { - m["half_pixel_centers"] = value + m["seed2"] = value } } -// Computes the gradient of nearest neighbor interpolation. +// Outputs random values from a normal distribution. +// +// The generated values will have mean 0 and standard deviation 1. // // Arguments: -// grads: 4-D with shape `[batch, height, width, channels]`. -// size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The -// original input size. +// shape: The shape of the output tensor. +// dtype: The type of the output. // -// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients -// with respect to the input image. -func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) { +// Returns A tensor of the specified shape filled with random normal values. +func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{} + attrs := map[string]interface{}{"dtype": dtype} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "ResizeNearestNeighborGrad", + Type: "RandomStandardNormal", Input: []tf.Input{ - grads, size, + shape, }, Attrs: attrs, } @@ -9502,162 +11618,60 @@ func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, op return op.Output(0) } -// Reshapes a quantized tensor as per the Reshape op. -// -// ``` -// -// Arguments: -// -// shape: Defines the shape of the output tensor. -// input_min: The minimum value of the input. -// input_max: The maximum value of the input. -// -// Returns This value is copied from input_min.This value is copied from input_max. -func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "QuantizedReshape", - Input: []tf.Input{ - tensor, shape, input_min, input_max, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} +// FusedBatchNormAttr is an optional argument to FusedBatchNorm. 
+type FusedBatchNormAttr func(optionalAttr) -// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)` +// FusedBatchNormEpsilon sets the optional epsilon attribute to value. // -// if < 0, `scale * features` otherwise. -// -// To be used together with -// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`. -// For correct dropout, use `tf.contrib.nn.alpha_dropout`. -// -// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) -func Selu(scope *Scope, features tf.Output) (activations tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Selu", - Input: []tf.Input{ - features, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// LeakyReluGradAttr is an optional argument to LeakyReluGrad. -type LeakyReluGradAttr func(optionalAttr) - -// LeakyReluGradAlpha sets the optional alpha attribute to value. -// If not specified, defaults to 0.2 -func LeakyReluGradAlpha(value float32) LeakyReluGradAttr { +// value: A small float number added to the variance of x. +// If not specified, defaults to 0.0001 +func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr { return func(m optionalAttr) { - m["alpha"] = value + m["epsilon"] = value } } -// Computes rectified linear gradients for a LeakyRelu operation. +// FusedBatchNormDataFormat sets the optional data_format attribute to value. // -// Arguments: -// gradients: The backpropagated gradients to the corresponding LeakyRelu operation. -// features: The features passed as input to the corresponding LeakyRelu operation, -// OR the outputs of that operation (both work equivalently). -// -// Returns `gradients * (features > 0) + alpha * gradients * (featurs <= 0)`. -func LeakyReluGrad(scope *Scope, gradients tf.Output, features tf.Output, optional ...LeakyReluGradAttr) (backprops tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LeakyReluGrad", - Input: []tf.Input{ - gradients, features, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MaxPoolGradAttr is an optional argument to MaxPoolGrad. -type MaxPoolGradAttr func(optionalAttr) - -// MaxPoolGradDataFormat sets the optional data_format attribute to value. -// -// value: Specify the data format of the input and output data. With the -// default format "NHWC", the data is stored in the order of: -// [batch, in_height, in_width, in_channels]. -// Alternatively, the format could be "NCHW", the data storage order of: -// [batch, in_channels, in_height, in_width]. +// value: The data format for x and y. Either "NHWC" (default) or "NCHW". // If not specified, defaults to "NHWC" -func MaxPoolGradDataFormat(value string) MaxPoolGradAttr { +func FusedBatchNormDataFormat(value string) FusedBatchNormAttr { return func(m optionalAttr) { m["data_format"] = value } } -// Computes gradients of the maxpooling function. +// FusedBatchNormIsTraining sets the optional is_training attribute to value. +// +// value: A bool value to indicate the operation is for training (default) +// or inference. +// If not specified, defaults to true +func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr { + return func(m optionalAttr) { + m["is_training"] = value + } +} + +// Batch normalization. +// +// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". +// The size of 1D Tensors matches the dimension C of the 4D Tensors. 
// // Arguments: -// orig_input: The original input tensor. -// orig_output: The original output tensor. -// grad: 4-D. Gradients w.r.t. the output of `max_pool`. -// ksize: The size of the window for each dimension of the input tensor. -// strides: The stride of the sliding window for each dimension of the -// input tensor. -// padding: The type of padding algorithm to use. +// x: A 4D Tensor for input data. +// scale: A 1D Tensor for scaling factor, to scale the normalized x. +// offset: A 1D Tensor for offset, to shift to the normalized x. +// mean: A 1D Tensor for population mean. Used for inference only; +// must be empty for training. +// variance: A 1D Tensor for population variance. Used for inference only; +// must be empty for training. // -// Returns Gradients w.r.t. the input to `max_pool`. -func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MaxPoolGrad", - Input: []tf.Input{ - orig_input, orig_output, grad, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// VariableShapeAttr is an optional argument to VariableShape. -type VariableShapeAttr func(optionalAttr) - -// VariableShapeOutType sets the optional out_type attribute to value. -// If not specified, defaults to DT_INT32 -func VariableShapeOutType(value tf.DataType) VariableShapeAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// Returns the shape of the variable pointed to by `resource`. -// -// This operation returns a 1-D integer tensor representing the shape of `input`. -// -// For example: -// -// ``` -// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] -// shape(t) ==> [2, 2, 3] -// ``` -func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) { +// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow +// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by +// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused +// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance +// in the cuDNN case), to be reused in the gradient computation. +func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) { if scope.Err() != nil { return } @@ -9666,7 +11680,299 @@ func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) a(attrs) } opspec := tf.OpSpec{ - Type: "VariableShape", + Type: "FusedBatchNorm", + Input: []tf.Input{ + x, scale, offset, mean, variance, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4) +} + +// RandomUniformIntAttr is an optional argument to RandomUniformInt. +type RandomUniformIntAttr func(optionalAttr) + +// RandomUniformIntSeed sets the optional seed attribute to value. 
+// +// value: If either `seed` or `seed2` are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func RandomUniformIntSeed(value int64) RandomUniformIntAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// RandomUniformIntSeed2 sets the optional seed2 attribute to value. +// +// value: A second seed to avoid seed collision. +// If not specified, defaults to 0 +func RandomUniformIntSeed2(value int64) RandomUniformIntAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Outputs random integers from a uniform distribution. +// +// The generated values are uniform integers in the range `[minval, maxval)`. +// The lower bound `minval` is included in the range, while the upper bound +// `maxval` is excluded. +// +// The random integers are slightly biased unless `maxval - minval` is an exact +// power of two. The bias is small for values of `maxval - minval` significantly +// smaller than the range of the output (either `2^32` or `2^64`). +// +// Arguments: +// shape: The shape of the output tensor. +// minval: 0-D. Inclusive lower bound on the generated integers. +// maxval: 0-D. Exclusive upper bound on the generated integers. +// +// Returns A tensor of the specified shape filled with uniform random integers. +func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RandomUniformInt", + Input: []tf.Input{ + shape, minval, maxval, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// LoadTPUEmbeddingFTRLParametersAttr is an optional argument to LoadTPUEmbeddingFTRLParameters. +type LoadTPUEmbeddingFTRLParametersAttr func(optionalAttr) + +// LoadTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func LoadTPUEmbeddingFTRLParametersTableId(value int64) LoadTPUEmbeddingFTRLParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// LoadTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func LoadTPUEmbeddingFTRLParametersTableName(value string) LoadTPUEmbeddingFTRLParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Load FTRL embedding parameters. +// +// An op that loads optimization parameters into HBM for embedding. Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. +// +// Arguments: +// parameters: Value of parameters used in the FTRL optimization algorithm. +// accumulators: Value of accumulators used in the FTRL optimization algorithm. +// linears: Value of linears used in the FTRL optimization algorithm. +// +// +// +// Returns the created operation. 
+func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LoadTPUEmbeddingFTRLParameters", + Input: []tf.Input{ + parameters, accumulators, linears, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Computes the log of the absolute value of `Gamma(x)` element-wise. +func Lgamma(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Lgamma", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Advance the counter of a counter-based RNG. +// +// The state of the RNG after +// `rng_skip(n)` will be the same as that after `stateful_uniform([n])` +// (or any other distribution). The actual increment added to the +// counter is an unspecified implementation detail. +// +// Arguments: +// resource: The handle of the resource variable that stores the state of the RNG. +// algorithm: The RNG algorithm. +// delta: The amount of advancement. +// +// Returns the created operation. +func RngSkip(scope *Scope, resource tf.Output, algorithm tf.Output, delta tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "RngSkip", + Input: []tf.Input{ + resource, algorithm, delta, + }, + } + return scope.AddOperation(opspec) +} + +// QuantizedDepthwiseConv2DAttr is an optional argument to QuantizedDepthwiseConv2D. +type QuantizedDepthwiseConv2DAttr func(optionalAttr) + +// QuantizedDepthwiseConv2DOutType sets the optional out_type attribute to value. +// +// value: The type of the output. +// If not specified, defaults to DT_QINT32 +func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2DAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. +// +// value: List of dilation values. +// If not specified, defaults to <i:1 i:1 i:1 i:1 > +func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes quantized depthwise Conv2D. +// +// Arguments: +// input: The original input tensor. +// filter: The original filter tensor. +// min_input: The float value that the minimum quantized input value represents. +// max_input: The float value that the maximum quantized input value represents. +// min_filter: The float value that the minimum quantized filter value represents. +// max_filter: The float value that the maximum quantized filter value represents. +// strides: List of stride values. +// +// +// Returns The output tensor.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents. 
+func QuantizedDepthwiseConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QuantizedDepthwiseConv2D", + Input: []tf.Input{ + input, filter, min_input, max_input, min_filter, max_filter, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Outputs random integers from a uniform distribution. +// +// The generated values are uniform integers in the range `[minval, maxval)`. +// The lower bound `minval` is included in the range, while the upper bound +// `maxval` is excluded. +// +// The random integers are slightly biased unless `maxval - minval` is an exact +// power of two. The bias is small for values of `maxval - minval` significantly +// smaller than the range of the output (either `2^32` or `2^64`). +// +// Arguments: +// resource: The handle of the resource variable that stores the state of the RNG. +// algorithm: The RNG algorithm. +// shape: The shape of the output tensor. +// minval: Minimum value (inclusive, scalar). +// maxval: Maximum value (exclusive, scalar). +// +// Returns Random values with specified shape. +func StatefulUniformInt(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "StatefulUniformInt", + Input: []tf.Input{ + resource, algorithm, shape, minval, maxval, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AngleAttr is an optional argument to Angle. +type AngleAttr func(optionalAttr) + +// AngleTout sets the optional Tout attribute to value. +// If not specified, defaults to DT_FLOAT +func AngleTout(value tf.DataType) AngleAttr { + return func(m optionalAttr) { + m["Tout"] = value + } +} + +// Returns the argument of a complex number. +// +// Given a tensor `input` of complex numbers, this operation returns a tensor of +// type `float` that is the argument of each element in `input`. All elements in +// `input` must be complex numbers of the form \\(a + bj\\), where *a* +// is the real part and *b* is the imaginary part. +// +// The argument returned by this operation is of the form \\(atan2(b, a)\\). +// +// For example: +// +// ``` +// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j] +// tf.angle(input) ==> [2.0132, 1.056] +// ``` +// +// @compatibility(numpy) +// Equivalent to np.angle. +// @end_compatibility +func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Angle", Input: []tf.Input{ input, }, @@ -9676,119 +11982,32 @@ func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) return op.Output(0) } -// Returns the set of files matching one or more glob patterns. -// -// Note that this routine only supports wildcard characters in the -// basename portion of the pattern, not in the directory portion. -// Note also that the order of filenames returned can be non-deterministic. 
-// -// Arguments: -// pattern: Shell wildcard pattern(s). Scalar or vector of type string. -// -// Returns A vector of matching filenames. -func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "MatchingFiles", - Input: []tf.Input{ - pattern, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} +// StatefulTruncatedNormalAttr is an optional argument to StatefulTruncatedNormal. +type StatefulTruncatedNormalAttr func(optionalAttr) -// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad. -type MaxPool3DGradGradAttr func(optionalAttr) - -// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value. +// StatefulTruncatedNormalDtype sets the optional dtype attribute to value. // -// value: The data format of the input and output data. With the -// default format "NDHWC", the data is stored in the order of: -// [batch, in_depth, in_height, in_width, in_channels]. -// Alternatively, the format could be "NCDHW", the data storage order is: -// [batch, in_channels, in_depth, in_height, in_width]. -// If not specified, defaults to "NDHWC" -func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr { +// value: The type of the output. +// If not specified, defaults to DT_FLOAT +func StatefulTruncatedNormalDtype(value tf.DataType) StatefulTruncatedNormalAttr { return func(m optionalAttr) { - m["data_format"] = value + m["dtype"] = value } } -// Computes second-order gradients of the maxpooling function. +// Outputs random values from a truncated normal distribution. +// +// The generated values follow a normal distribution with mean 0 and standard +// deviation 1, except that values whose magnitude is more than 2 standard +// deviations from the mean are dropped and re-picked. // // Arguments: -// orig_input: The original input tensor. -// orig_output: The original output tensor. -// grad: Output backprop of shape `[batch, depth, rows, cols, channels]`. -// ksize: 1-D tensor of length 5. The size of the window for each dimension of -// the input tensor. Must have `ksize[0] = ksize[4] = 1`. -// strides: 1-D tensor of length 5. The stride of the sliding window for each -// dimension of `input`. Must have `strides[0] = strides[4] = 1`. -// padding: The type of padding algorithm to use. +// resource: The handle of the resource variable that stores the state of the RNG. +// algorithm: The RNG algorithm. +// shape: The shape of the output tensor. // -// Returns Gradients of gradients w.r.t. the input to `max_pool`. -func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MaxPool3DGradGrad", - Input: []tf.Input{ - orig_input, orig_output, grad, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad. -type FractionalAvgPoolGradAttr func(optionalAttr) - -// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value. -// -// value: When set to True, it means when pooling, the values at the boundary -// of adjacent pooling cells are used by both cells. 
For example: -// -// `index 0 1 2 3 4` -// -// `value 20 5 16 3 7` -// -// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice. -// The result would be [41/3, 26/3] for fractional avg pooling. -// If not specified, defaults to false -func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr { - return func(m optionalAttr) { - m["overlapping"] = value - } -} - -// Computes gradient of the FractionalAvgPool function. -// -// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for -// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of -// out_backprop to those indices that form the same pooling cell. Therefore, we -// just need to know the shape of original input tensor, instead of the whole -// tensor. -// -// Arguments: -// orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool` -// out_backprop: 4-D with shape `[batch, height, width, channels]`. Gradients -// w.r.t. the output of `fractional_avg_pool`. -// row_pooling_sequence: row pooling sequence, form pooling region with -// col_pooling_sequence. -// col_pooling_sequence: column pooling sequence, form pooling region with -// row_pooling sequence. -// -// Returns 4-D. Gradients w.r.t. the input of `fractional_avg_pool`. -func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) { +// Returns Random values with specified shape. +func StatefulTruncatedNormal(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulTruncatedNormalAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -9797,9 +12016,9 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_ a(attrs) } opspec := tf.OpSpec{ - Type: "FractionalAvgPoolGrad", + Type: "StatefulTruncatedNormal", Input: []tf.Input{ - orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence, + resource, algorithm, shape, }, Attrs: attrs, } @@ -9807,107 +12026,140 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_ return op.Output(0) } -// Computes Psi, the derivative of Lgamma (the log of the absolute value of +// FIFOQueueV2Attr is an optional argument to FIFOQueueV2. +type FIFOQueueV2Attr func(optionalAttr) + +// FIFOQueueV2Shapes sets the optional shapes attribute to value. // -// `Gamma(x)`), element-wise. -func Digamma(scope *Scope, x tf.Output) (y tf.Output) { +// value: The shape of each component in a value. The length of this attr must +// be either 0 or the same as the length of component_types. If the length of +// this attr is 0, the shapes of queue elements are not constrained, and +// only one element may be dequeued at a time. +// If not specified, defaults to <> +// +// REQUIRES: len(value) >= 0 +func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr { + return func(m optionalAttr) { + m["shapes"] = value + } +} + +// FIFOQueueV2Capacity sets the optional capacity attribute to value. +// +// value: The upper bound on the number of elements in this queue. +// Negative numbers mean no limit. +// If not specified, defaults to -1 +func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// FIFOQueueV2Container sets the optional container attribute to value. +// +// value: If non-empty, this queue is placed in the given container. 
+// Otherwise, a default container is used. +// If not specified, defaults to "" +func FIFOQueueV2Container(value string) FIFOQueueV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// FIFOQueueV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this queue will be shared under the given name +// across multiple sessions. +// If not specified, defaults to "" +func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// A queue that produces elements in first-in first-out order. +// +// Arguments: +// component_types: The type of each component in a value. +// +// Returns The handle to the queue. +func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) { if scope.Err() != nil { return } + attrs := map[string]interface{}{"component_types": component_types} + for _, a := range optional { + a(attrs) + } opspec := tf.OpSpec{ - Type: "Digamma", - Input: []tf.Input{ - x, - }, + Type: "FIFOQueueV2", + + Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) } -// Returns locations of nonzero / true values in a tensor. +// IdentityReaderV2Attr is an optional argument to IdentityReaderV2. +type IdentityReaderV2Attr func(optionalAttr) + +// IdentityReaderV2Container sets the optional container attribute to value. // -// This operation returns the coordinates of true elements in `condition`. The -// coordinates are returned in a 2-D tensor where the first dimension (rows) -// represents the number of true elements, and the second dimension (columns) -// represents the coordinates of the true elements. Keep in mind, the shape of -// the output tensor can vary depending on how many true values there are in -// `condition`. Indices are output in row-major order. +// value: If non-empty, this reader is placed in the given container. +// Otherwise, a default container is used. +// If not specified, defaults to "" +func IdentityReaderV2Container(value string) IdentityReaderV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// IdentityReaderV2SharedName sets the optional shared_name attribute to value. // -// For example: +// value: If non-empty, this reader is named in the given bucket +// with this shared_name. Otherwise, the node name is used instead. +// If not specified, defaults to "" +func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// A Reader that outputs the queued work as both the key and value. // -// ``` -// # 'input' tensor is [[True, False] -// # [True, False]] -// # 'input' has two true values, so output has two coordinates. -// # 'input' has rank of 2, so coordinates have two indices. -// where(input) ==> [[0, 0], -// [1, 0]] +// To use, enqueue strings in a Queue. ReaderRead will take the front +// work string and output (work, work). // -// # `condition` tensor is [[[True, False] -// # [True, False]] -// # [[False, True] -// # [False, True]] -// # [[False, False] -// # [False, True]]] -// # 'input' has 5 true values, so output has 5 coordinates. -// # 'input' has rank of 3, so coordinates have three indices. 
-// where(input) ==> [[0, 0, 0], -// [0, 1, 0], -// [1, 0, 1], -// [1, 1, 1], -// [2, 1, 1]] -// -// # `condition` tensor is [[[1.5, 0.0] -// # [-0.5, 0.0]] -// # [[0.0, 0.25] -// # [0.0, 0.75]] -// # [[0.0, 0.0] -// # [0.0, 0.01]]] -// # 'input' has 5 nonzero values, so output has 5 coordinates. -// # 'input' has rank of 3, so coordinates have three indices. -// where(input) ==> [[0, 0, 0], -// [0, 1, 0], -// [1, 0, 1], -// [1, 1, 1], -// [2, 1, 1]] -// -// # `condition` tensor is [[[1.5 + 0.0j, 0.0 + 0.0j] -// # [0.0 + 0.5j, 0.0 + 0.0j]] -// # [[0.0 + 0.0j, 0.25 + 1.5j] -// # [0.0 + 0.0j, 0.75 + 0.0j]] -// # [[0.0 + 0.0j, 0.0 + 0.0j] -// # [0.0 + 0.0j, 0.01 + 0.0j]]] -// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates. -// # 'input' has rank of 3, so coordinates have three indices. -// where(input) ==> [[0, 0, 0], -// [0, 1, 0], -// [1, 0, 1], -// [1, 1, 1], -// [2, 1, 1]] -// ``` -func Where(scope *Scope, condition tf.Output) (index tf.Output) { +// Returns The handle to reference the Reader. +func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) { if scope.Err() != nil { return } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } opspec := tf.OpSpec{ - Type: "Where", - Input: []tf.Input{ - condition, - }, + Type: "IdentityReaderV2", + + Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) } -// Returns the name of the device on which `resource` has been placed. -func ExperimentalIteratorGetDevice(scope *Scope, resource tf.Output) (device tf.Output) { +// Returns the truth value of (x <= y) element-wise. +// +// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "ExperimentalIteratorGetDevice", + Type: "LessEqual", Input: []tf.Input{ - resource, + x, y, }, } op := scope.AddOperation(opspec) @@ -9962,53 +12214,151 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi return op.Output(0) } -// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder. -type CTCBeamSearchDecoderAttr func(optionalAttr) - -// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value. -// -// value: If true, merge repeated classes in output. -// If not specified, defaults to true -func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr { - return func(m optionalAttr) { - m["merge_repeated"] = value - } -} - -// Performs beam search decoding on the logits given in input. -// -// A note about the attribute merge_repeated: For the beam search decoder, -// this means that if consecutive entries in a beam are the same, only -// the first of these is emitted. That is, when the top path is "A B B B B", -// "A B" is returned if merge_repeated = True but "A B B B B" is -// returned if merge_repeated = False. -// -// Arguments: -// inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits. -// sequence_length: A vector containing sequence lengths, size `(batch)`. -// beam_width: A scalar >= 0 (beam search beam width). -// top_paths: A scalar >= 0, <= beam_width (controls output size). -// -// Returns A list (length: top_paths) of indices matrices. Matrix j, -// size `(total_decoded_outputs[j] x 2)`, has indices of a -// `SparseTensor<int64, 2>`. 
The rows store: [batch, time].A list (length: top_paths) of values vectors. Vector j, -// size `(length total_decoded_outputs[j])`, has the values of a -// `SparseTensor<int64, 2>`. The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector. Vector j, -// size `(2)`, stores the shape of the decoded `SparseTensor[j]`. -// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`. The -// sequence log-probabilities. -func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) { +// Computes softplus: `log(exp(features) + 1)`. +func Softplus(scope *Scope, features tf.Output) (activations tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths} + opspec := tf.OpSpec{ + Type: "Softplus", + Input: []tf.Input{ + features, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// SkipgramAttr is an optional argument to Skipgram. +type SkipgramAttr func(optionalAttr) + +// SkipgramWindowSize sets the optional window_size attribute to value. +// +// value: The number of words to predict to the left and right of the target. +// If not specified, defaults to 5 +func SkipgramWindowSize(value int64) SkipgramAttr { + return func(m optionalAttr) { + m["window_size"] = value + } +} + +// SkipgramMinCount sets the optional min_count attribute to value. +// +// value: The minimum number of word occurrences for it to be included in the +// vocabulary. +// If not specified, defaults to 5 +func SkipgramMinCount(value int64) SkipgramAttr { + return func(m optionalAttr) { + m["min_count"] = value + } +} + +// SkipgramSubsample sets the optional subsample attribute to value. +// +// value: Threshold for word occurrence. Words that appear with higher +// frequency will be randomly down-sampled. Set to 0 to disable. +// If not specified, defaults to 0.001 +func SkipgramSubsample(value float32) SkipgramAttr { + return func(m optionalAttr) { + m["subsample"] = value + } +} + +// Parses a text file and creates a batch of examples. +// +// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result +// +// Arguments: +// filename: The corpus's text file name. +// batch_size: The size of produced batch. +// +// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids. +func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "CTCBeamSearchDecoder", + Type: "Skipgram", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6) +} + +// StatefulUniformAttr is an optional argument to StatefulUniform. 
+type StatefulUniformAttr func(optionalAttr) + +// StatefulUniformDtype sets the optional dtype attribute to value. +// +// value: The type of the output. +// If not specified, defaults to DT_FLOAT +func StatefulUniformDtype(value tf.DataType) StatefulUniformAttr { + return func(m optionalAttr) { + m["dtype"] = value + } +} + +// Outputs random values from a uniform distribution. +// +// The generated values follow a uniform distribution in the range `[0, 1)`. The +// lower bound 0 is included in the range, while the upper bound 1 is excluded. +// +// Arguments: +// resource: The handle of the resource variable that stores the state of the RNG. +// algorithm: The RNG algorithm. +// shape: The shape of the output tensor. +// +// Returns Random values with specified shape. +func StatefulUniform(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulUniformAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StatefulUniform", Input: []tf.Input{ - inputs, sequence_length, + resource, algorithm, shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Execute a sub graph on a remote processor. +// +// The graph specifications(such as graph itself, input tensors and output names) +// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo +// as serialized_remote_fused_graph_execute_info. +// The specifications will be passed to a dedicated registered +// remote fused graph executor. The executor will send the graph specifications +// to a remote processor and execute that graph. The execution results +// will be passed to consumer nodes as outputs of this node. +// +// Arguments: +// inputs: Arbitrary number of tensors with arbitrary data types +// +// serialized_remote_fused_graph_execute_info: Serialized protocol buffer +// of RemoteFusedGraphExecuteInfo which contains graph specifications. +// +// Returns Arbitrary number of tensors with arbitrary data types +func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info} + opspec := tf.OpSpec{ + Type: "RemoteFusedGraphExecute", + Input: []tf.Input{ + tf.OutputList(inputs), }, Attrs: attrs, } @@ -10018,177 +12368,280 @@ func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Out } var idx int var err error - if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil { - scope.UpdateErr("CTCBeamSearchDecoder", err) + if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil { + scope.UpdateErr("RemoteFusedGraphExecute", err) return } - if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil { - scope.UpdateErr("CTCBeamSearchDecoder", err) - return - } - if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil { - scope.UpdateErr("CTCBeamSearchDecoder", err) - return - } - log_probability = op.Output(idx) - return decoded_indices, decoded_values, decoded_shape, log_probability + return outputs } -// Computes the Cholesky decomposition of one or more square matrices. +// Locks a mutex resource. The output is the lock. 
So long as the lock tensor // -// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions -// form square matrices. +// is alive, any other request to use `MutexLock` with this mutex will wait. // -// The input has to be symmetric and positive definite. Only the lower-triangular -// part of the input will be used for this operation. The upper-triangular part -// will not be read. +// This is particularly useful for creating a critical section when used in +// conjunction with `MutexLockIdentity`: // -// The output is a tensor of the same shape as the input -// containing the Cholesky decompositions for all input submatrices `[..., :, :]`. +// ```python // -// **Note**: The gradient computation on GPU is faster for large matrices but -// not for large batch dimensions when the submatrices are small. In this -// case it might be faster to use the CPU. +// mutex = mutex_v2( +// shared_name=handle_name, container=container, name=name) +// +// def execute_in_critical_section(fn, *args, **kwargs): +// lock = gen_resource_variable_ops.mutex_lock(mutex) +// +// with ops.control_dependencies([lock]): +// r = fn(*args, **kwargs) +// +// with ops.control_dependencies(nest.flatten(r)): +// with ops.colocate_with(mutex): +// ensure_lock_exists = mutex_lock_identity(lock) +// +// # Make sure that if any element of r is accessed, all of +// # them are executed together. +// r = nest.map_structure(tf.identity, r) +// +// with ops.control_dependencies([ensure_lock_exists]): +// return nest.map_structure(tf.identity, r) +// ``` +// +// While `fn` is running in the critical section, no other functions which wish to +// use this critical section may run. +// +// Often the use case is that two executions of the same graph, in parallel, +// wish to run `fn`; and we wish to ensure that only one of them executes +// at a time. This is especially important if `fn` modifies one or more +// variables at a time. +// +// It is also useful if two separate functions must share a resource, but we +// wish to ensure the usage is exclusive. // // Arguments: -// input: Shape is `[..., M, M]`. +// mutex: The mutex resource to lock. // -// Returns Shape is `[..., M, M]`. -func Cholesky(scope *Scope, input tf.Output) (output tf.Output) { +// Returns A tensor that keeps a shared pointer to a lock on the mutex; +// when the Tensor is destroyed, the use count on the shared pointer is decreased +// by 1. When it reaches 0, the lock is released. +func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "Cholesky", + Type: "MutexLock", Input: []tf.Input{ - input, + mutex, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput. -type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr) - -// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value. -// -// value: Specify the data format of the input and output data. With the -// default format "NHWC", the data is stored in the order of: -// [batch, height, width, channels]. -// Alternatively, the format could be "NCHW", the data storage order of: -// [batch, channels, height, width]. 
-// If not specified, defaults to "NHWC" -func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value. -// -// value: 1-D tensor of length 4. The dilation factor for each dimension of -// `input`. If set to k > 1, there will be k-1 skipped cells between each filter -// element on that dimension. The dimension order is determined by the value of -// `data_format`, see above for details. Dilations in the batch and depth -// dimensions must be 1. -// If not specified, defaults to <i:1 i:1 i:1 i:1 > -func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes the gradients of depthwise convolution with respect to the input. +// Initializes the multi device iterator with the given dataset. // // Arguments: -// input_sizes: An integer vector representing the shape of `input`, based -// on `data_format`. For example, if `data_format` is 'NHWC' then -// `input` is a 4-D `[batch, height, width, channels]` tensor. -// filter: 4-D with shape -// `[filter_height, filter_width, in_channels, depthwise_multiplier]`. -// out_backprop: 4-D with shape based on `data_format`. -// For example, if `data_format` is 'NHWC' then -// out_backprop shape is `[batch, out_height, out_width, out_channels]`. -// Gradients w.r.t. the output of the convolution. -// strides: The stride of the sliding window for each dimension of the input -// of the convolution. -// padding: The type of padding algorithm to use. +// dataset: Dataset to be iterated upon. +// multi_device_iterator: A MultiDeviceIteratorResource. +// max_buffer_size: The maximum size of the host side per device buffer to keep. // -// Returns 4-D with shape according to `data_format`. For example, if -// `data_format` is 'NHWC', output shape is `[batch, in_height, -// in_width, in_channels]`. Gradient w.r.t. the input of the -// convolution. -func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) { +// Returns An int64 indicating which incarnation of the MultiDeviceIterator +// is running. +func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"strides": strides, "padding": padding} + opspec := tf.OpSpec{ + Type: "MultiDeviceIteratorInit", + Input: []tf.Input{ + dataset, multi_device_iterator, max_buffer_size, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// QuantizedRelu6Attr is an optional argument to QuantizedRelu6. +type QuantizedRelu6Attr func(optionalAttr) + +// QuantizedRelu6OutType sets the optional out_type attribute to value. +// If not specified, defaults to DT_QUINT8 +func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)` +// +// Arguments: +// +// min_features: The float value that the lowest quantized value represents. +// max_features: The float value that the highest quantized value represents. 
+// +// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents. +func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "DepthwiseConv2dNativeBackpropInput", + Type: "QuantizedRelu6", Input: []tf.Input{ - input_sizes, filter, out_backprop, + features, min_features, max_features, }, Attrs: attrs, } op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// This op consumes a lock created by `MutexLock`. +// +// This op exists to consume a tensor created by `MutexLock` (other than +// direct control dependencies). It should be the only that consumes the tensor, +// and will raise an error if it is not. Its only purpose is to keep the +// mutex lock tensor alive until it is consumed by this op. +// +// **NOTE**: This operation must run on the same device as its input. This may +// be enforced via the `colocate_with` mechanism. +// +// Arguments: +// mutex_lock: A tensor returned by `MutexLock`. +// +// Returns the created operation. +func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ConsumeMutexLock", + Input: []tf.Input{ + mutex_lock, + }, + } + return scope.AddOperation(opspec) +} + +// MutexV2Attr is an optional argument to MutexV2. +type MutexV2Attr func(optionalAttr) + +// MutexV2Container sets the optional container attribute to value. +// +// value: If non-empty, this variable is placed in the given container. +// Otherwise, a default container is used. +// If not specified, defaults to "" +func MutexV2Container(value string) MutexV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// MutexV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this variable is named in the given bucket +// with this shared_name. Otherwise, the node name is used instead. +// If not specified, defaults to "" +func MutexV2SharedName(value string) MutexV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Creates a Mutex resource that can be locked by `MutexLock`. +// +// Returns The mutex resource. +func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MutexV2", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) return op.Output(0) } -// Divides sparse updates into the variable referenced by `resource`. +// Selects elements from `x` or `y`, depending on `condition`. // -// This operation computes +// The `x`, and `y` tensors must all have the same shape, and the +// output will also have that shape. // -// # Scalar indices -// ref[indices, ...] /= updates[...] +// The `condition` tensor must be a scalar if `x` and `y` are scalars. +// If `x` and `y` are vectors or higher rank, then `condition` must be either a +// scalar, a vector with size matching the first dimension of `x`, or must have +// the same shape as `x`. // -// # Vector indices (for each i) -// ref[indices[i], ...] /= updates[i, ...] 
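[Editor's aside, not part of the patch: the MutexV2, MutexLock, and ConsumeMutexLock wrappers in this hunk combine into the critical-section pattern that the MutexLock comment illustrates in Python. The sketch below is one possible Go translation, under the assumption that the guarded work is sequenced with Scope.WithControlDependencies; the shared name is made up.]

```go
package main

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()

	// A mutex shared by name across sessions (name is illustrative).
	mutex := op.MutexV2(s, op.MutexV2SharedName("example_mutex"))

	// Acquire the lock; the lock tensor keeps the mutex held while it is alive.
	lock := op.MutexLock(s, mutex)

	// Work that must run while the lock is held: build it in a scope that
	// carries a control dependency on the op producing the lock tensor.
	guarded := s.WithControlDependencies(lock.Op)
	result := op.Sqrt(guarded, op.Const(guarded, float32(2.0)))

	// Consume (and thereby release) the lock only after the guarded work ran.
	done := guarded.WithControlDependencies(result.Op)
	op.ConsumeMutexLock(done, lock)

	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
}
```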
+// The `condition` tensor acts as a mask that chooses, based on the value at each +// element, whether the corresponding element / row in the output should be +// taken from `x` (if true) or `y` (if false). // -// # High rank indices (for each i, ..., j) -// ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...] +// If `condition` is a vector and `x` and `y` are higher rank matrices, then +// it chooses which row (outer dimension) to copy from `x` and `y`. +// If `condition` has the same shape as `x` and `y`, then it chooses which +// element to copy from `x` and `y`. // -// Duplicate entries are handled correctly: if multiple `indices` reference -// the same location, their contributions multiply. +// For example: // -// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. +// ```python +// # 'condition' tensor is [[True, False] +// # [False, True]] +// # 't' is [[1, 2], +// # [3, 4]] +// # 'e' is [[5, 6], +// # [7, 8]] +// select(condition, t, e) # => [[1, 6], [7, 4]] // -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt> -// </div> +// +// # 'condition' tensor is [True, False] +// # 't' is [[1, 2], +// # [3, 4]] +// # 'e' is [[5, 6], +// # [7, 8]] +// select(condition, t, e) ==> [[1, 2], +// [7, 8]] +// +// ``` // // Arguments: -// resource: Should be from a `Variable` node. -// indices: A tensor of indices into the first dimension of `ref`. -// updates: A tensor of updated values to add to `ref`. // -// Returns the created operation. -func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { +// x: = A `Tensor` which may have the same shape as `condition`. +// If `condition` is rank 1, `x` may have higher rank, +// but its first dimension must match the size of `condition`. +// y: = A `Tensor` with the same type and shape as `x`. +// +// Returns = A `Tensor` with the same type and shape as `x` and `y`. +func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "ResourceScatterDiv", + Type: "Select", Input: []tf.Input{ - resource, indices, updates, + condition, x, y, }, } - return scope.AddOperation(opspec) + op := scope.AddOperation(opspec) + return op.Output(0) } -// Returns x + y element-wise. +// Returns x // y element-wise. // -// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting +// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { +func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "Add", + Type: "FloorDiv", Input: []tf.Input{ x, y, }, @@ -10197,47 +12650,127 @@ func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { return op.Output(0) } -// Splits a tensor into a list. +// LoadTPUEmbeddingProximalAdagradParametersAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParameters. +type LoadTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr) + +// LoadTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 // -// list[i] corresponds to lengths[i] tensors from the input tensor. 
-// The tensor must have rank at least 1 and contain exactly sum(lengths) elements. -// -// tensor: The input tensor. -// element_shape: A shape compatible with that of elements in the tensor. -// lengths: Vector of sizes of the 0th dimension of tensors in the list. -// output_handle: The list. -func TensorListSplit(scope *Scope, tensor tf.Output, element_shape tf.Output, lengths tf.Output) (output_handle tf.Output) { - if scope.Err() != nil { - return +// REQUIRES: value >= -1 +func LoadTPUEmbeddingProximalAdagradParametersTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value } - opspec := tf.OpSpec{ - Type: "TensorListSplit", - Input: []tf.Input{ - tensor, element_shape, lengths, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) } -// Creates a dataset that emits the lines of one or more text files. +// LoadTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func LoadTPUEmbeddingProximalAdagradParametersTableName(value string) LoadTPUEmbeddingProximalAdagradParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Load proximal Adagrad embedding parameters. +// +// An op that loads optimization parameters into HBM for embedding. Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. // // Arguments: -// filenames: A scalar or a vector containing the name(s) of the file(s) to be -// read. -// compression_type: A scalar containing either (i) the empty string (no -// compression), (ii) "ZLIB", or (iii) "GZIP". -// buffer_size: A scalar containing the number of bytes to buffer. -func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) { +// parameters: Value of parameters used in the proximal Adagrad optimization algorithm. +// accumulators: Value of accumulators used in the proximal Adagrad optimization algorithm. +// +// +// +// Returns the created operation. +func LoadTPUEmbeddingProximalAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LoadTPUEmbeddingProximalAdagradParameters", + Input: []tf.Input{ + parameters, accumulators, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Assigns sparse updates to the variable referenced by `resource`. +// +// This operation computes +// +// # Scalar indices +// ref[indices, ...] = updates[...] +// +// # Vector indices (for each i) +// ref[indices[i], ...] = updates[i, ...] +// +// # High rank indices (for each i, ..., j) +// ref[indices[i, ..., j], ...] = updates[i, ..., j, ...] +// +// Arguments: +// resource: Should be from a `Variable` node. +// indices: A tensor of indices into the first dimension of `ref`. +// updates: A tensor of updated values to add to `ref`. +// +// Returns the created operation. 
+func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "TextLineDataset", + Type: "ResourceScatterUpdate", Input: []tf.Input{ - filenames, compression_type, buffer_size, + resource, indices, updates, }, } + return scope.AddOperation(opspec) +} + +// SerializeSparseAttr is an optional argument to SerializeSparse. +type SerializeSparseAttr func(optionalAttr) + +// SerializeSparseOutType sets the optional out_type attribute to value. +// +// value: The `dtype` to use for serialization; the supported types are `string` +// (default) and `variant`. +// If not specified, defaults to DT_STRING +func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// Serialize a `SparseTensor` into a `[3]` `Tensor` object. +// +// Arguments: +// sparse_indices: 2-D. The `indices` of the `SparseTensor`. +// sparse_values: 1-D. The `values` of the `SparseTensor`. +// sparse_shape: 1-D. The `shape` of the `SparseTensor`. +func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SerializeSparse", + Input: []tf.Input{ + sparse_indices, sparse_values, sparse_shape, + }, + Attrs: attrs, + } op := scope.AddOperation(opspec) return op.Output(0) } @@ -10330,10 +12863,1149 @@ func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, return op.Output(0) } -// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad. -type MaxPoolGradGradAttr func(optionalAttr) +// Returns the index of a data point that should be added to the seed set. +// +// Entries in distances are assumed to be squared distances of candidate points to +// the already sampled centers in the seed set. The op constructs one Markov chain +// of the k-MC^2 algorithm and returns the index of one candidate point to be added +// as an additional cluster center. +// +// Arguments: +// distances: Vector with squared distances to the closest previously sampled cluster center +// for each candidate point. +// seed: Scalar. Seed for initializing the random number generator. +// +// Returns Scalar with the index of the sampled point. +func KMC2ChainInitialization(scope *Scope, distances tf.Output, seed tf.Output) (index tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "KMC2ChainInitialization", + Input: []tf.Input{ + distances, seed, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} -// MaxPoolGradGradDataFormat sets the optional data_format attribute to value. +// Subtracts sparse updates from the variable referenced by `resource`. +// +// This operation computes +// +// # Scalar indices +// ref[indices, ...] -= updates[...] +// +// # Vector indices (for each i) +// ref[indices[i], ...] -= updates[i, ...] +// +// # High rank indices (for each i, ..., j) +// ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...] +// +// Duplicate entries are handled correctly: if multiple `indices` reference +// the same location, their contributions add. +// +// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. 
+// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt> +// </div> +// +// Arguments: +// resource: Should be from a `Variable` node. +// indices: A tensor of indices into the first dimension of `ref`. +// updates: A tensor of updated values to add to `ref`. +// +// Returns the created operation. +func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ResourceScatterSub", + Input: []tf.Input{ + resource, indices, updates, + }, + } + return scope.AddOperation(opspec) +} + +// Computes square root of x element-wise. +// +// I.e., \\(y = \sqrt{x} = x^{1/2}\\). +func Sqrt(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Sqrt", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// StringSplitAttr is an optional argument to StringSplit. +type StringSplitAttr func(optionalAttr) + +// StringSplitSkipEmpty sets the optional skip_empty attribute to value. +// +// value: A `bool`. If `True`, skip the empty strings from the result. +// If not specified, defaults to true +func StringSplitSkipEmpty(value bool) StringSplitAttr { + return func(m optionalAttr) { + m["skip_empty"] = value + } +} + +// Split elements of `input` based on `delimiter` into a `SparseTensor`. +// +// Let N be the size of source (typically N will be the batch size). Split each +// element of `input` based on `delimiter` and return a `SparseTensor` +// containing the splitted tokens. Empty tokens are ignored. +// +// `delimiter` can be empty, or a string of split characters. If `delimiter` is an +// empty string, each element of `input` is split into individual single-byte +// character strings, including splitting of UTF-8 multibyte sequences. Otherwise +// every character of `delimiter` is a potential split point. +// +// For example: +// N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output +// will be +// +// indices = [0, 0; +// 0, 1; +// 1, 0; +// 1, 1; +// 1, 2] +// shape = [2, 3] +// values = ['hello', 'world', 'a', 'b', 'c'] +// +// Arguments: +// input: 1-D. Strings to split. +// delimiter: 0-D. Delimiter characters (bytes), or empty string. +// +// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse +// tensor, where the first value is N and the second value is the maximum number +// of tokens in a single input entry. +func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StringSplit", + Input: []tf.Input{ + input, delimiter, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Computes gradients for SparseSegmentMean. +// +// Returns tensor "output" with same shape as grad, except for dimension 0 whose +// value is output_dim0. +// +// Arguments: +// grad: gradient propagated to the SparseSegmentMean op. +// indices: indices passed to the corresponding SparseSegmentMean op. 
+// segment_ids: segment_ids passed to the corresponding SparseSegmentMean op. +// output_dim0: dimension 0 of "data" passed to SparseSegmentMean op. +func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseSegmentMeanGrad", + Input: []tf.Input{ + grad, indices, segment_ids, output_dim0, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Adds a value to the current value of a variable. +// +// Any ReadVariableOp with a control dependency on this op is guaranteed to +// see the incremented value or a subsequent newer one. +// +// Arguments: +// resource: handle to the resource in which to store the variable. +// value: the value by which the variable will be incremented. +// +// Returns the created operation. +func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "AssignAddVariableOp", + Input: []tf.Input{ + resource, value, + }, + } + return scope.AddOperation(opspec) +} + +// Reads the value of a variable. +// +// The tensor returned by this operation is immutable. +// +// The value returned by this operation is guaranteed to be influenced by all the +// writes on which this operation depends directly or indirectly, and to not be +// influenced by any of the writes which depend directly or indirectly on this +// operation. +// +// Arguments: +// resource: handle to the resource in which to store the variable. +// dtype: the dtype of the value. +func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype} + opspec := tf.OpSpec{ + Type: "ReadVariableOp", + Input: []tf.Input{ + resource, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the gradient of morphological 2-D dilation with respect to the input. +// +// Arguments: +// input: 4-D with shape `[batch, in_height, in_width, depth]`. +// filter: 3-D with shape `[filter_height, filter_width, depth]`. +// out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`. +// strides: 1-D of length 4. The stride of the sliding window for each dimension of +// the input tensor. Must be: `[1, stride_height, stride_width, 1]`. +// rates: 1-D of length 4. The input stride for atrous morphological dilation. +// Must be: `[1, rate_height, rate_width, 1]`. +// padding: The type of padding algorithm to use. +// +// Returns 4-D with shape `[batch, in_height, in_width, depth]`. +func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding} + opspec := tf.OpSpec{ + Type: "Dilation2DBackpropInput", + Input: []tf.Input{ + input, filter, out_backprop, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Eagerly executes a python function to compute func(input)->output. The +// +// semantics of the input, output, and attributes are the same as those for +// PyFunc. 
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"token": token, "Tout": Tout} + opspec := tf.OpSpec{ + Type: "EagerPyFunc", + Input: []tf.Input{ + tf.OutputList(input), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if output, idx, err = makeOutputList(op, idx, "output"); err != nil { + scope.UpdateErr("EagerPyFunc", err) + return + } + return output +} + +// RetrieveTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingCenteredRMSPropParameters. +type RetrieveTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr) + +// RetrieveTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingCenteredRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// RetrieveTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func RetrieveTPUEmbeddingCenteredRMSPropParametersTableName(value string) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Retrieve centered RMSProp embedding parameters. +// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the centered RMSProp optimization algorithm.Parameter ms updated by the centered RMSProp optimization algorithm.Parameter mom updated by the centered RMSProp optimization algorithm.Parameter mg updated by the centered RMSProp optimization algorithm. +func RetrieveTPUEmbeddingCenteredRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingCenteredRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingCenteredRMSPropParameters", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3) +} + +// Adds `bias` to `value`. +// +// This is a deprecated version of BiasAdd and will be soon removed. +// +// This is a special case of `tf.add` where `bias` is restricted to be 1-D. +// Broadcasting is supported, so `value` may have any number of dimensions. +// +// Arguments: +// value: Any number of dimensions. +// bias: 1-D with size the last dimension of `value`. +// +// Returns Broadcasted sum of `value` and `bias`. +func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BiasAddV1", + Input: []tf.Input{ + value, bias, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the absolute value of a tensor. 
+// +// Given a tensor `x`, this operation returns a tensor containing the absolute +// value of each element in `x`. For example, if x is an input element and y is +// an output element, this operation computes \\(y = |x|\\). +func Abs(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Abs", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// DataFormatDimMapAttr is an optional argument to DataFormatDimMap. +type DataFormatDimMapAttr func(optionalAttr) + +// DataFormatDimMapSrcFormat sets the optional src_format attribute to value. +// +// value: source data format. +// If not specified, defaults to "NHWC" +func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr { + return func(m optionalAttr) { + m["src_format"] = value + } +} + +// DataFormatDimMapDstFormat sets the optional dst_format attribute to value. +// +// value: destination data format. +// If not specified, defaults to "NCHW" +func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr { + return func(m optionalAttr) { + m["dst_format"] = value + } +} + +// Returns the dimension index in the destination data format given the one in +// +// the source data format. +// +// Arguments: +// x: A Tensor with each element as a dimension index in source data format. +// Must be in the range [-4, 4). +// +// Returns A Tensor with each element as a dimension index in destination data format. +func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DataFormatDimMap", + Input: []tf.Input{ + x, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Writes the given dataset to the given file using the TFRecord format. +// +// Arguments: +// input_dataset: A variant tensor representing the dataset to write. +// filename: A scalar string tensor representing the filename to use. +// compression_type: A scalar string tensor containing either (i) the empty string (no +// compression), (ii) "ZLIB", or (iii) "GZIP". +// +// Returns the created operation. +func ExperimentalDatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ExperimentalDatasetToTFRecord", + Input: []tf.Input{ + input_dataset, filename, compression_type, + }, + } + return scope.AddOperation(opspec) +} + +// Produces the average pool of the input tensor for quantized types. +// +// Arguments: +// input: 4-D with shape `[batch, height, width, channels]`. +// min_input: The float value that the lowest quantized input value represents. +// max_input: The float value that the highest quantized input value represents. +// ksize: The size of the window for each dimension of the input tensor. +// The length must be 4 to match the number of dimensions of the input. +// strides: The stride of the sliding window for each dimension of the input +// tensor. The length must be 4 to match the number of dimensions of the input. +// padding: The type of padding algorithm to use. +// +// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. 
+func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} + opspec := tf.OpSpec{ + Type: "QuantizedAvgPool", + Input: []tf.Input{ + input, min_input, max_input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// A placeholder op that passes through `input` when its output is not fed. +// +// Arguments: +// input: The default value to produce when `output` is not fed. +// shape: The (possibly partial) shape of the tensor. +// +// Returns A placeholder tensor that defaults to `input` if it is not fed. +func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"shape": shape} + opspec := tf.OpSpec{ + Type: "PlaceholderWithDefault", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns locations of nonzero / true values in a tensor. +// +// This operation returns the coordinates of true elements in `condition`. The +// coordinates are returned in a 2-D tensor where the first dimension (rows) +// represents the number of true elements, and the second dimension (columns) +// represents the coordinates of the true elements. Keep in mind, the shape of +// the output tensor can vary depending on how many true values there are in +// `condition`. Indices are output in row-major order. +// +// For example: +// +// ``` +// # 'input' tensor is [[True, False] +// # [True, False]] +// # 'input' has two true values, so output has two coordinates. +// # 'input' has rank of 2, so coordinates have two indices. +// where(input) ==> [[0, 0], +// [1, 0]] +// +// # `condition` tensor is [[[True, False] +// # [True, False]] +// # [[False, True] +// # [False, True]] +// # [[False, False] +// # [False, True]]] +// # 'input' has 5 true values, so output has 5 coordinates. +// # 'input' has rank of 3, so coordinates have three indices. +// where(input) ==> [[0, 0, 0], +// [0, 1, 0], +// [1, 0, 1], +// [1, 1, 1], +// [2, 1, 1]] +// +// # `condition` tensor is [[[1.5, 0.0] +// # [-0.5, 0.0]] +// # [[0.0, 0.25] +// # [0.0, 0.75]] +// # [[0.0, 0.0] +// # [0.0, 0.01]]] +// # 'input' has 5 nonzero values, so output has 5 coordinates. +// # 'input' has rank of 3, so coordinates have three indices. +// where(input) ==> [[0, 0, 0], +// [0, 1, 0], +// [1, 0, 1], +// [1, 1, 1], +// [2, 1, 1]] +// +// # `condition` tensor is [[[1.5 + 0.0j, 0.0 + 0.0j] +// # [0.0 + 0.5j, 0.0 + 0.0j]] +// # [[0.0 + 0.0j, 0.25 + 1.5j] +// # [0.0 + 0.0j, 0.75 + 0.0j]] +// # [[0.0 + 0.0j, 0.0 + 0.0j] +// # [0.0 + 0.0j, 0.01 + 0.0j]]] +// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates. +// # 'input' has rank of 3, so coordinates have three indices. +// where(input) ==> [[0, 0, 0], +// [0, 1, 0], +// [1, 0, 1], +// [1, 1, 1], +// [2, 1, 1]] +// ``` +func Where(scope *Scope, condition tf.Output) (index tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Where", + Input: []tf.Input{ + condition, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// DecodeJpegAttr is an optional argument to DecodeJpeg. 
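[Editor's aside, not part of the patch: the Where wrapper above can be exercised end to end in a few lines. This is a minimal sketch mirroring the first example in the doc comment; it assumes the standard tensorflow/go session API and a locally available TensorFlow runtime.]

```go
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	cond := op.Placeholder(s, tf.Bool)
	idx := op.Where(s, cond)

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	// Matches the doc comment: two true values in a rank-2 input
	// yield two [row, col] coordinates.
	input, _ := tf.NewTensor([][]bool{{true, false}, {true, false}})
	out, err := sess.Run(
		map[tf.Output]*tf.Tensor{cond: input},
		[]tf.Output{idx},
		nil,
	)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value()) // expected: [[0 0] [1 0]]
}
```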
+type DecodeJpegAttr func(optionalAttr) + +// DecodeJpegChannels sets the optional channels attribute to value. +// +// value: Number of color channels for the decoded image. +// If not specified, defaults to 0 +func DecodeJpegChannels(value int64) DecodeJpegAttr { + return func(m optionalAttr) { + m["channels"] = value + } +} + +// DecodeJpegRatio sets the optional ratio attribute to value. +// +// value: Downscaling ratio. +// If not specified, defaults to 1 +func DecodeJpegRatio(value int64) DecodeJpegAttr { + return func(m optionalAttr) { + m["ratio"] = value + } +} + +// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value. +// +// value: If true use a slower but nicer upscaling of the +// chroma planes (yuv420/422 only). +// If not specified, defaults to true +func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr { + return func(m optionalAttr) { + m["fancy_upscaling"] = value + } +} + +// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value. +// +// value: If true try to recover an image from truncated input. +// If not specified, defaults to false +func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr { + return func(m optionalAttr) { + m["try_recover_truncated"] = value + } +} + +// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value. +// +// value: The minimum required fraction of lines before a truncated +// input is accepted. +// If not specified, defaults to 1 +func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr { + return func(m optionalAttr) { + m["acceptable_fraction"] = value + } +} + +// DecodeJpegDctMethod sets the optional dct_method attribute to value. +// +// value: string specifying a hint about the algorithm used for +// decompression. Defaults to "" which maps to a system-specific +// default. Currently valid values are ["INTEGER_FAST", +// "INTEGER_ACCURATE"]. The hint may be ignored (e.g., the internal +// jpeg library changes to a version that does not have that specific +// option.) +// If not specified, defaults to "" +func DecodeJpegDctMethod(value string) DecodeJpegAttr { + return func(m optionalAttr) { + m["dct_method"] = value + } +} + +// Decode a JPEG-encoded image to a uint8 tensor. +// +// The attr `channels` indicates the desired number of color channels for the +// decoded image. +// +// Accepted values are: +// +// * 0: Use the number of channels in the JPEG-encoded image. +// * 1: output a grayscale image. +// * 3: output an RGB image. +// +// If needed, the JPEG-encoded image is transformed to match the requested number +// of color channels. +// +// The attr `ratio` allows downscaling the image by an integer factor during +// decoding. Allowed values are: 1, 2, 4, and 8. This is much faster than +// downscaling the image later. +// +// +// This op also supports decoding PNGs and non-animated GIFs since the interface is +// the same, though it is cleaner to use `tf.image.decode_image`. +// +// Arguments: +// contents: 0-D. The JPEG-encoded image. +// +// Returns 3-D with shape `[height, width, channels]`.. 
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DecodeJpeg", + Input: []tf.Input{ + contents, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ExperimentalThreadPoolHandleAttr is an optional argument to ExperimentalThreadPoolHandle. +type ExperimentalThreadPoolHandleAttr func(optionalAttr) + +// ExperimentalThreadPoolHandleMaxIntraOpParallelism sets the optional max_intra_op_parallelism attribute to value. +// +// value: The maximum degree of parallelism to use within operations that execute on this +// threadpool. +// If not specified, defaults to 1 +func ExperimentalThreadPoolHandleMaxIntraOpParallelism(value int64) ExperimentalThreadPoolHandleAttr { + return func(m optionalAttr) { + m["max_intra_op_parallelism"] = value + } +} + +// ExperimentalThreadPoolHandleContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func ExperimentalThreadPoolHandleContainer(value string) ExperimentalThreadPoolHandleAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// ExperimentalThreadPoolHandleSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func ExperimentalThreadPoolHandleSharedName(value string) ExperimentalThreadPoolHandleAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Creates a dataset that uses a custom thread pool to compute `input_dataset`. +// +// Arguments: +// num_threads: The number of threads in the thread pool. +// display_name: A human-readable name for the threads that may be visible in some +// visualizations. +// threadpool. +// +// Returns A resource that can be consumed by one or more ExperimentalThreadPoolDataset +// ops. +func ExperimentalThreadPoolHandle(scope *Scope, num_threads int64, display_name string, optional ...ExperimentalThreadPoolHandleAttr) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_threads": num_threads, "display_name": display_name} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ExperimentalThreadPoolHandle", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation. +type DenseToSparseSetOperationAttr func(optionalAttr) + +// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value. +// If not specified, defaults to true +func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr { + return func(m optionalAttr) { + m["validate_indices"] = value + } +} + +// Applies set operation along last dimension of `Tensor` and `SparseTensor`. +// +// See SetOperationOp::SetOperationFromContext for values of `set_operation`. +// +// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`, +// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same +// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but +// ignored. +// +// If `validate_indices` is `True`, this op validates the order and range of `set2` +// indices. +// +// Output `result` is a `SparseTensor` represented by `result_indices`, +// `result_values`, and `result_shape`. 
For `set1` and `set2` ranked `n`, this +// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth` +// dimension contains the result of `set_operation` applied to the corresponding +// `[0...n-1]` dimension of `set`. +// +// Arguments: +// set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`. +// Dimension `n` contains values in a set, duplicates are allowed but ignored. +// set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major +// order. +// set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major +// order. +// set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must +// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the +// max set size across `n-1` dimensions. +// +// +// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is +// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]` +// is the max result set size across all `0...n-1` dimensions. +func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"set_operation": set_operation} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DenseToSparseSetOperation", + Input: []tf.Input{ + set1, set2_indices, set2_values, set2_shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// RandomPoissonAttr is an optional argument to RandomPoisson. +type RandomPoissonAttr func(optionalAttr) + +// RandomPoissonSeed sets the optional seed attribute to value. +// If not specified, defaults to 0 +func RandomPoissonSeed(value int64) RandomPoissonAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// RandomPoissonSeed2 sets the optional seed2 attribute to value. +// If not specified, defaults to 0 +func RandomPoissonSeed2(value int64) RandomPoissonAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Use RandomPoissonV2 instead. +// +// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2 +func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RandomPoisson", + Input: []tf.Input{ + shape, rate, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Greedily selects a subset of bounding boxes in descending order of score, +// +// pruning away boxes that have high overlaps +// with previously selected boxes. Bounding boxes with score less than +// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix, +// which allows for defining a custom overlap criterium (eg. intersection over union, +// intersection over area, etc.). +// +// The output of this operation is a set of integers indexing into the input +// collection of bounding boxes representing the selected boxes. The bounding +// box coordinates corresponding to the selected indices can then be obtained +// using the `tf.gather operation`. 
For example: +// +// selected_indices = tf.image.non_max_suppression_with_overlaps( +// overlaps, scores, max_output_size, overlap_threshold, score_threshold) +// selected_boxes = tf.gather(boxes, selected_indices) +// +// Arguments: +// overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing +// the n-by-n box overlap values. +// scores: A 1-D float tensor of shape `[num_boxes]` representing a single +// score corresponding to each box (each row of boxes). +// max_output_size: A scalar integer tensor representing the maximum number of +// boxes to be selected by non max suppression. +// overlap_threshold: A 0-D float tensor representing the threshold for deciding whether +// boxes overlap too. +// score_threshold: A 0-D float tensor representing the threshold for deciding when to remove +// boxes based on score. +// +// Returns A 1-D integer tensor of shape `[M]` representing the selected +// indices from the boxes tensor, where `M <= max_output_size`. +func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "NonMaxSuppressionWithOverlaps", + Input: []tf.Input{ + overlaps, scores, max_output_size, overlap_threshold, score_threshold, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Pop the element at the top of the stack. +// +// Arguments: +// handle: The handle to a stack. +// elem_type: The type of the elem that is popped. +// +// Returns The tensor that is popped from the top of the stack. +func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"elem_type": elem_type} + opspec := tf.OpSpec{ + Type: "StackPopV2", + Input: []tf.Input{ + handle, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)` +// +// if < 0, `scale * features` otherwise. +// +// To be used together with +// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`. +// For correct dropout, use `tf.contrib.nn.alpha_dropout`. +// +// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) +func Selu(scope *Scope, features tf.Output) (activations tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Selu", + Input: []tf.Input{ + features, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Fills empty rows in the input 2-D `SparseTensor` with a default value. +// +// The input `SparseTensor` is represented via the tuple of inputs +// (`indices`, `values`, `dense_shape`). The output `SparseTensor` has the +// same `dense_shape` but with indices `output_indices` and values +// `output_values`. +// +// This op inserts a single entry for every row that doesn't have any values. +// The index is created as `[row, 0, ..., 0]` and the inserted value +// is `default_value`. 
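[Editor's aside, not part of the patch: the Python-style snippet in the NonMaxSuppressionWithOverlaps comment above translates roughly as follows in Go. This is only a sketch; it assumes the generated Gather wrapper (not shown in this hunk) is available, and the thresholds and max_output_size are illustrative.]

```go
package main

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()

	boxes := op.Placeholder(s, tf.Float)    // [num_boxes, 4]
	overlaps := op.Placeholder(s, tf.Float) // [num_boxes, num_boxes]
	scores := op.Placeholder(s, tf.Float)   // [num_boxes]

	selected := op.NonMaxSuppressionWithOverlaps(s,
		overlaps,
		scores,
		op.Const(s, int32(10)),    // max_output_size
		op.Const(s, float32(0.5)), // overlap_threshold
		op.Const(s, float32(0.0)), // score_threshold
	)

	// Gather the boxes for the selected indices (assumes op.Gather exists,
	// as in the generated wrappers).
	selectedBoxes := op.Gather(s, boxes, selected)
	_ = selectedBoxes

	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
}
```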
+// +// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values: +// +// [0, 1]: a +// [0, 3]: b +// [2, 0]: c +// [3, 1]: d +// +// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values: +// +// [0, 1]: a +// [0, 3]: b +// [1, 0]: default_value +// [2, 0]: c +// [3, 1]: d +// [4, 0]: default_value +// +// The output `SparseTensor` will be in row-major order and will have the +// same shape as the input. +// +// This op also returns an indicator vector shaped `[dense_shape[0]]` such that +// +// empty_row_indicator[i] = True iff row i was an empty row. +// +// And a reverse index map vector shaped `[indices.shape[0]]` that is used during +// backpropagation, +// +// reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :] +// +// Arguments: +// indices: 2-D. the indices of the sparse tensor. +// values: 1-D. the values of the sparse tensor. +// dense_shape: 1-D. the shape of the sparse tensor. +// default_value: 0-D. default value to insert into location `[row, 0, ..., 0]` +// for rows missing from the input sparse tensor. +// output indices: 2-D. the indices of the filled sparse tensor. +// +// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the +// input sparse tensor.1-D. a map from the input indices to the output indices. +func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseFillEmptyRows", + Input: []tf.Input{ + indices, values, dense_shape, default_value, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3) +} + +// Computes reciprocal of square root of x element-wise. +// +// I.e., \\(y = 1 / \sqrt{x}\\). +func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Rsqrt", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap. +type AddManySparseToTensorsMapAttr func(optionalAttr) + +// AddManySparseToTensorsMapContainer sets the optional container attribute to value. +// +// value: The container name for the `SparseTensorsMap` created by this op. +// If not specified, defaults to "" +func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value. +// +// value: The shared name for the `SparseTensorsMap` created by this op. +// If blank, the new Operation's unique name is used. +// If not specified, defaults to "" +func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles. 
+// +// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`, +// `sparse_values`, and `sparse_shape`, where +// +// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R``` +// +// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor` +// having a first `sparse_indices` column taking values between `[0, N)`, where +// the minibatch size `N == sparse_shape[0]`. +// +// The input `SparseTensor` must have rank `R` greater than 1, and the first +// dimension is treated as the minibatch dimension. Elements of the `SparseTensor` +// must be sorted in increasing order of this first dimension. The stored +// `SparseTensor` objects pointed to by each row of the output `sparse_handles` +// will have rank `R-1`. +// +// The `SparseTensor` values can then be read out as part of a minibatch by passing +// the given keys as vector elements to `TakeManySparseFromTensorsMap`. To ensure +// the correct `SparseTensorsMap` is accessed, ensure that the same +// `container` and `shared_name` are passed to that Op. If no `shared_name` +// is provided here, instead use the *name* of the Operation created by calling +// `AddManySparseToTensorsMap` as the `shared_name` passed to +// `TakeManySparseFromTensorsMap`. Ensure the Operations are colocated. +// +// Arguments: +// sparse_indices: 2-D. The `indices` of the minibatch `SparseTensor`. +// `sparse_indices[:, 0]` must be ordered values in `[0, N)`. +// sparse_values: 1-D. The `values` of the minibatch `SparseTensor`. +// sparse_shape: 1-D. The `shape` of the minibatch `SparseTensor`. +// The minibatch size `N == sparse_shape[0]`. +// +// Returns 1-D. The handles of the `SparseTensor` now stored in the +// `SparseTensorsMap`. Shape: `[N]`. +func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "AddManySparseToTensorsMap", + Input: []tf.Input{ + sparse_indices, sparse_values, sparse_shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Makes a new iterator from the given `dataset` and stores it in `iterator`. +// +// This operation may be executed multiple times. Each execution will reset the +// iterator in `iterator` to the first element of `dataset`. +// +// Returns the created operation. +func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "MakeIterator", + Input: []tf.Input{ + dataset, iterator, + }, + } + return scope.AddOperation(opspec) +} + +// Returns element-wise integer closest to x. +// +// If the result is midway between two representable values, +// the even representable is chosen. +// For example: +// +// ``` +// rint(-1.5) ==> -2.0 +// rint(0.5000001) ==> 1.0 +// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.] +// ``` +func Rint(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Rint", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2. 
+type MaxPoolGradGradV2Attr func(optionalAttr) + +// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value. // // value: Specify the data format of the input and output data. With the // default format "NHWC", the data is stored in the order of: @@ -10341,7 +14013,7 @@ type MaxPoolGradGradAttr func(optionalAttr) // Alternatively, the format could be "NCHW", the data storage order of: // [batch, in_channels, in_height, in_width]. // If not specified, defaults to "NHWC" -func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr { +func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr { return func(m optionalAttr) { m["data_format"] = value } @@ -10359,7 +14031,2252 @@ func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr { // padding: The type of padding algorithm to use. // // Returns Gradients of gradients w.r.t. the input to `max_pool`. -func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) { +func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MaxPoolGradGradV2", + Input: []tf.Input{ + orig_input, orig_output, grad, ksize, strides, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a Dataset that returns pseudorandom numbers. +// +// Arguments: +// seed: A scalar seed for the random number generator. If either seed or +// seed2 is set to be non-zero, the random number generator is seeded +// by the given seed. Otherwise, a random seed is used. +// seed2: A second scalar seed to avoid seed collision. +// +// +func ExperimentalRandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "ExperimentalRandomDataset", + Input: []tf.Input{ + seed, seed2, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap. +type AddSparseToTensorsMapAttr func(optionalAttr) + +// AddSparseToTensorsMapContainer sets the optional container attribute to value. +// +// value: The container name for the `SparseTensorsMap` created by this op. +// If not specified, defaults to "" +func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value. +// +// value: The shared name for the `SparseTensorsMap` created by this op. +// If blank, the new Operation's unique name is used. +// If not specified, defaults to "" +func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Add a `SparseTensor` to a `SparseTensorsMap` return its handle. +// +// A `SparseTensor` is represented by three tensors: `sparse_indices`, +// `sparse_values`, and `sparse_shape`. 
+// +// This operator takes the given `SparseTensor` and adds it to a container +// object (a `SparseTensorsMap`). A unique key within this container is generated +// in the form of an `int64`, and this is the value that is returned. +// +// The `SparseTensor` can then be read out as part of a minibatch by passing +// the key as a vector element to `TakeManySparseFromTensorsMap`. To ensure +// the correct `SparseTensorsMap` is accessed, ensure that the same +// `container` and `shared_name` are passed to that Op. If no `shared_name` +// is provided here, instead use the *name* of the Operation created by calling +// `AddSparseToTensorsMap` as the `shared_name` passed to +// `TakeManySparseFromTensorsMap`. Ensure the Operations are colocated. +// +// Arguments: +// sparse_indices: 2-D. The `indices` of the `SparseTensor`. +// sparse_values: 1-D. The `values` of the `SparseTensor`. +// sparse_shape: 1-D. The `shape` of the `SparseTensor`. +// +// Returns 0-D. The handle of the `SparseTensor` now stored in the +// `SparseTensorsMap`. +func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "AddSparseToTensorsMap", + Input: []tf.Input{ + sparse_indices, sparse_values, sparse_shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler. +type UniformCandidateSamplerAttr func(optionalAttr) + +// UniformCandidateSamplerSeed sets the optional seed attribute to value. +// +// value: If either seed or seed2 are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func UniformCandidateSamplerSeed(value int64) UniformCandidateSamplerAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// UniformCandidateSamplerSeed2 sets the optional seed2 attribute to value. +// +// value: An second seed to avoid seed collision. +// If not specified, defaults to 0 +func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Generates labels for candidate sampling with a uniform distribution. +// +// See explanations of candidate sampling and the data formats at +// go/candidate-sampling. +// +// For each batch, this op picks a single set of sampled candidate labels. +// +// The advantages of sampling candidates per-batch are simplicity and the +// possibility of efficient dense matrix multiplication. The disadvantage is that +// the sampled candidates must be chosen independently of the context and of the +// true labels. +// +// Arguments: +// true_classes: A batch_size * num_true matrix, in which each row contains the +// IDs of the num_true target_classes in the corresponding original label. +// num_true: Number of true labels per context. +// num_sampled: Number of candidates to randomly sample. +// unique: If unique is true, we sample with rejection, so that all sampled +// candidates in a batch are unique. This requires some approximation to +// estimate the post-rejection sampling probabilities. +// range_max: The sampler will sample integers from the interval [0, range_max). 
+// +// Returns A vector of length num_sampled, in which each element is +// the ID of a sampled candidate.A batch_size * num_true matrix, representing +// the number of times each candidate is expected to occur in a batch +// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled +// candidate representing the number of times the candidate is expected +// to occur in a batch of sampled candidates. If unique=true, then this is a +// probability. +func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...UniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "UniformCandidateSampler", + Input: []tf.Input{ + true_classes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Returns the element-wise max of two SparseTensors. +// +// Assumes the two SparseTensors have the same shape, i.e., no broadcasting. +// +// Arguments: +// a_indices: 2-D. `N x R` matrix with the indices of non-empty values in a +// SparseTensor, in the canonical lexicographic ordering. +// a_values: 1-D. `N` non-empty values corresponding to `a_indices`. +// a_shape: 1-D. Shape of the input SparseTensor. +// b_indices: counterpart to `a_indices` for the other operand. +// b_values: counterpart to `a_values` for the other operand; must be of the same dtype. +// b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal. +// +// Returns 2-D. The indices of the output SparseTensor.1-D. The values of the output SparseTensor. +func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseSparseMaximum", + Input: []tf.Input{ + a_indices, a_values, a_shape, b_indices, b_values, b_shape, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// QuantizedConv2DAttr is an optional argument to QuantizedConv2D. +type QuantizedConv2DAttr func(optionalAttr) + +// QuantizedConv2DOutType sets the optional out_type attribute to value. +// If not specified, defaults to DT_QINT32 +func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// QuantizedConv2DDilations sets the optional dilations attribute to value. +// +// value: 1-D tensor of length 4. The dilation factor for each dimension of +// `input`. If set to k > 1, there will be k-1 skipped cells between each +// filter element on that dimension. The dimension order is determined by the +// value of `data_format`, see above for details. Dilations in the batch and +// depth dimensions must be 1. +// If not specified, defaults to <i:1 i:1 i:1 i:1 > +func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes a 2D convolution given quantized 4D input and filter tensors. 
+// +// The inputs are quantized tensors where the lowest value represents the real +// number of the associated minimum, and the highest represents the maximum. +// This means that you can only interpret the quantized output in the same way, by +// taking the returned minimum and maximum values into account. +// +// Arguments: +// +// filter: filter's input_depth dimension must match input's depth dimensions. +// min_input: The float value that the lowest quantized input value represents. +// max_input: The float value that the highest quantized input value represents. +// min_filter: The float value that the lowest quantized filter value represents. +// max_filter: The float value that the highest quantized filter value represents. +// strides: The stride of the sliding window for each dimension of the input +// tensor. +// padding: The type of padding algorithm to use. +// +// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. +func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QuantizedConv2D", + Input: []tf.Input{ + input, filter, min_input, max_input, min_filter, max_filter, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape. +type ExtractJpegShapeAttr func(optionalAttr) + +// ExtractJpegShapeOutputType sets the optional output_type attribute to value. +// +// value: (Optional) The output type of the operation (int32 or int64). +// Defaults to int32. +// If not specified, defaults to DT_INT32 +func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr { + return func(m optionalAttr) { + m["output_type"] = value + } +} + +// Extract the shape information of a JPEG-encoded image. +// +// This op only parses the image header, so it is much faster than DecodeJpeg. +// +// Arguments: +// contents: 0-D. The JPEG-encoded image. +// +// Returns 1-D. The image shape with format [height, width, channels]. +func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ExtractJpegShape", + Input: []tf.Input{ + contents, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Applies softmax to a batched N-D `SparseTensor`. +// +// The inputs represent an N-D SparseTensor with logical shape `[..., B, C]` +// (where `N >= 2`), and with indices sorted in the canonical lexicographic order. +// +// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost +// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly +// zero elements do not participate*. 
Specifically, the algorithm is equivalent +// to the following: +// +// (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix +// with shape `[B, C]`, along the size-C dimension; +// (2) Masks out the original implicitly-zero locations; +// (3) Renormalizes the remaining elements. +// +// Hence, the `SparseTensor` result has exactly the same non-zero indices and +// shape. +// +// Arguments: +// sp_indices: 2-D. `NNZ x R` matrix with the indices of non-empty values in a +// SparseTensor, in canonical ordering. +// sp_values: 1-D. `NNZ` non-empty values corresponding to `sp_indices`. +// sp_shape: 1-D. Shape of the input SparseTensor. +// +// Returns 1-D. The `NNZ` values for the result `SparseTensor`. +func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseSoftmax", + Input: []tf.Input{ + sp_indices, sp_values, sp_shape, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Component-wise multiplies a SparseTensor by a dense Tensor. +// +// The output locations corresponding to the implicitly zero elements in the sparse +// tensor will be zero (i.e., will not take up storage space), regardless of the +// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN). +// +// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not +// the other direction. +// +// Arguments: +// sp_indices: 2-D. `N x R` matrix with the indices of non-empty values in a +// SparseTensor, possibly not in canonical ordering. +// sp_values: 1-D. `N` non-empty values corresponding to `sp_indices`. +// sp_shape: 1-D. Shape of the input SparseTensor. +// dense: `R`-D. The dense Tensor operand. +// +// Returns 1-D. The `N` values that are operated on. +func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseDenseCwiseMul", + Input: []tf.Input{ + sp_indices, sp_values, sp_shape, dense, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2. +type Conv3DBackpropInputV2Attr func(optionalAttr) + +// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value. +// +// value: The data format of the input and output data. With the +// default format "NDHWC", the data is stored in the order of: +// [batch, in_depth, in_height, in_width, in_channels]. +// Alternatively, the format could be "NCDHW", the data storage order is: +// [batch, in_channels, in_depth, in_height, in_width]. +// If not specified, defaults to "NDHWC" +func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value. +// +// value: 1-D tensor of length 5. The dilation factor for each dimension of +// `input`. If set to k > 1, there will be k-1 skipped cells between each +// filter element on that dimension. The dimension order is determined by the +// value of `data_format`, see above for details. Dilations in the batch and +// depth dimensions must be 1. 
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 > +func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes the gradients of 3-D convolution with respect to the input. +// +// Arguments: +// input_sizes: An integer vector representing the tensor shape of `input`, +// where `input` is a 5-D +// `[batch, depth, rows, cols, in_channels]` tensor. +// filter: Shape `[depth, rows, cols, in_channels, out_channels]`. +// `in_channels` must match between `input` and `filter`. +// out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols, +// out_channels]`. +// strides: 1-D tensor of length 5. The stride of the sliding window for each +// dimension of `input`. Must have `strides[0] = strides[4] = 1`. +// padding: The type of padding algorithm to use. +func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Conv3DBackpropInputV2", + Input: []tf.Input{ + input_sizes, filter, out_backprop, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Adds up a SparseTensor and a dense Tensor, using these special rules: +// +// (1) Broadcasts the dense side to have the same shape as the sparse side, if +// eligible; +// (2) Then, only the dense values pointed to by the indices of the SparseTensor +// participate in the cwise addition. +// +// By these rules, the result is a logical SparseTensor with exactly the same +// indices and shape, but possibly with different non-zero values. The output of +// this Op is the resultant non-zero values. +// +// Arguments: +// sp_indices: 2-D. `N x R` matrix with the indices of non-empty values in a +// SparseTensor, possibly not in canonical ordering. +// sp_values: 1-D. `N` non-empty values corresponding to `sp_indices`. +// sp_shape: 1-D. Shape of the input SparseTensor. +// dense: `R`-D. The dense Tensor operand. +// +// Returns 1-D. The `N` values that are operated on. +func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseDenseCwiseAdd", + Input: []tf.Input{ + sp_indices, sp_values, sp_shape, dense, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Component-wise divides a SparseTensor by a dense Tensor. +// +// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not +// the other direction. +// +// Arguments: +// sp_indices: 2-D. `N x R` matrix with the indices of non-empty values in a +// SparseTensor, possibly not in canonical ordering. +// sp_values: 1-D. `N` non-empty values corresponding to `sp_indices`. +// sp_shape: 1-D. Shape of the input SparseTensor. +// dense: `R`-D. The dense Tensor operand. +// +// Returns 1-D. The `N` values that are operated on. 
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseDenseCwiseDiv", + Input: []tf.Input{ + sp_indices, sp_values, sp_shape, dense, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Performs a padding as a preprocess during a convolution. +// +// Similar to FusedResizeAndPadConv2d, this op allows for an optimized +// implementation where the spatial padding transformation stage is fused with the +// im2col lookup, but in this case without the bilinear filtering required for +// resizing. Fusing the padding prevents the need to write out the intermediate +// results as whole tensors, reducing memory pressure, and we can get some latency +// gains by merging the transformation calculations. +// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC' +// order is used instead. +// Internally this op uses a single per-graph scratch buffer, which means that it +// will block if multiple versions are being run in parallel. This is because this +// operator is primarily an optimization to minimize memory usage. +// +// Arguments: +// input: 4-D with shape `[batch, in_height, in_width, in_channels]`. +// paddings: A two-column matrix specifying the padding sizes. The number of +// rows must be the same as the rank of `input`. +// filter: 4-D with shape +// `[filter_height, filter_width, in_channels, out_channels]`. +// +// strides: 1-D of length 4. The stride of the sliding window for each dimension +// of `input`. Must be in the same order as the dimension specified with format. +// padding: The type of padding algorithm to use. +func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding} + opspec := tf.OpSpec{ + Type: "FusedPadConv2D", + Input: []tf.Input{ + input, paddings, filter, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the matrix square root of one or more square matrices: +// +// matmul(sqrtm(A), sqrtm(A)) = A +// +// The input matrix should be invertible. If the input matrix is real, it should +// have no eigenvalues which are real and negative (pairs of complex conjugate +// eigenvalues are allowed). +// +// The matrix square root is computed by first reducing the matrix to +// quasi-triangular form with the real Schur decomposition. The square root +// of the quasi-triangular matrix is then computed directly. Details of +// the algorithm can be found in: Nicholas J. Higham, "Computing real +// square roots of a real matrix", Linear Algebra Appl., 1987. +// +// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +// form square matrices. The output is a tensor of the same shape as the input +// containing the matrix square root for all input submatrices `[..., :, :]`. +// +// Arguments: +// input: Shape is `[..., M, M]`. +// +// Returns Shape is `[..., M, M]`. 
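+//
+// A minimal usage sketch (illustrative; `a` is assumed to be a rank-2 `tf.Output`
+// of shape `[M, M]` in the scope `s`; for batched inputs, `BatchMatMul` would be
+// used for the check instead):
+//
+// ```
+// sqrtA := MatrixSquareRoot(s, a)
+// check := MatMul(s, sqrtA, sqrtA) // should reproduce `a` up to numerical error
+// ```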
+// +// @compatibility(scipy) +// Equivalent to scipy.linalg.sqrtm +// @end_compatibility +func MatrixSquareRoot(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "MatrixSquareRoot", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// SparseReduceMaxAttr is an optional argument to SparseReduceMax. +type SparseReduceMaxAttr func(optionalAttr) + +// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value. +// +// value: If true, retain reduced dimensions with length 1. +// If not specified, defaults to false +func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr { + return func(m optionalAttr) { + m["keep_dims"] = value + } +} + +// Computes the max of elements across dimensions of a SparseTensor. +// +// This Op takes a SparseTensor and is the sparse counterpart to +// `tf.reduce_max()`. In particular, this Op also returns a dense `Tensor` +// instead of a sparse one. +// +// Reduces `sp_input` along the dimensions given in `reduction_axes`. Unless +// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained +// with length 1. +// +// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor +// with a single element is returned. Additionally, the axes can be negative, +// which are interpreted according to the indexing rules in Python. +// +// Arguments: +// input_indices: 2-D. `N x R` matrix with the indices of non-empty values in a +// SparseTensor, possibly not in canonical ordering. +// input_values: 1-D. `N` non-empty values corresponding to `input_indices`. +// input_shape: 1-D. Shape of the input SparseTensor. +// reduction_axes: 1-D. Length-`K` vector containing the reduction axes. +// +// Returns `R-K`-D. The reduced Tensor. +func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SparseReduceMax", + Input: []tf.Input{ + input_indices, input_values, input_shape, reduction_axes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`. +// +// This Op does not require `a_indices` be sorted in standard lexicographic order. +// +// Arguments: +// a_indices: 2-D. The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`. +// a_values: 1-D. The `values` of the `SparseTensor`, with shape `[nnz]`. +// a_shape: 1-D. The `shape` of the `SparseTensor`, with shape `[ndims]`. +// b: `ndims`-D Tensor. With shape `a_shape`. +func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseTensorDenseAdd", + Input: []tf.Input{ + a_indices, a_values, a_shape, b, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Reshapes a SparseTensor to represent values in a new dense shape. +// +// This operation has the same semantics as reshape on the represented dense +// tensor. The `input_indices` are recomputed based on the requested `new_shape`. 
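+//
+// A minimal call sketch with this package (illustrative; `inputIndices` and
+// `inputShape` are assumed `tf.Output` values in the scope `s`; the -1 in
+// `newShape` is explained below):
+//
+// ```
+// newShape := Const(s, []int64{2, -1})
+// outIdx, outShape := SparseReshape(s, inputIndices, inputShape, newShape)
+// ```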
+// +// If one component of `new_shape` is the special value -1, the size of that +// dimension is computed so that the total dense size remains constant. At +// most one component of `new_shape` can be -1. The number of dense elements +// implied by `new_shape` must be the same as the number of dense elements +// originally implied by `input_shape`. +// +// Reshaping does not affect the order of values in the SparseTensor. +// +// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape` +// has length `R_out`, then `input_indices` has shape `[N, R_in]`, +// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and +// `output_shape` has length `R_out`. +// +// Arguments: +// input_indices: 2-D. `N x R_in` matrix with the indices of non-empty values in a +// SparseTensor. +// input_shape: 1-D. `R_in` vector with the input SparseTensor's dense shape. +// new_shape: 1-D. `R_out` vector with the requested new dense shape. +// +// Returns 2-D. `N x R_out` matrix with the updated indices of non-empty +// values in the output SparseTensor.1-D. `R_out` vector with the full dense shape of the output +// SparseTensor. This is the same as `new_shape` but with any -1 dimensions +// filled in. +func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseReshape", + Input: []tf.Input{ + input_indices, input_shape, new_shape, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// DecodePngAttr is an optional argument to DecodePng. +type DecodePngAttr func(optionalAttr) + +// DecodePngChannels sets the optional channels attribute to value. +// +// value: Number of color channels for the decoded image. +// If not specified, defaults to 0 +func DecodePngChannels(value int64) DecodePngAttr { + return func(m optionalAttr) { + m["channels"] = value + } +} + +// DecodePngDtype sets the optional dtype attribute to value. +// If not specified, defaults to DT_UINT8 +func DecodePngDtype(value tf.DataType) DecodePngAttr { + return func(m optionalAttr) { + m["dtype"] = value + } +} + +// Decode a PNG-encoded image to a uint8 or uint16 tensor. +// +// The attr `channels` indicates the desired number of color channels for the +// decoded image. +// +// Accepted values are: +// +// * 0: Use the number of channels in the PNG-encoded image. +// * 1: output a grayscale image. +// * 3: output an RGB image. +// * 4: output an RGBA image. +// +// If needed, the PNG-encoded image is transformed to match the requested number +// of color channels. +// +// This op also supports decoding JPEGs and non-animated GIFs since the interface +// is the same, though it is cleaner to use `tf.image.decode_image`. +// +// Arguments: +// contents: 0-D. The PNG-encoded image. +// +// Returns 3-D with shape `[height, width, channels]`. +func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DecodePng", + Input: []tf.Input{ + contents, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Reorders a SparseTensor into the canonical, row-major ordering. +// +// Note that by convention, all sparse ops preserve the canonical ordering along +// increasing dimension number. 
The only time ordering can be violated is during +// manual manipulation of the indices and values vectors to add entries. +// +// Reordering does not affect the shape of the SparseTensor. +// +// If the tensor has rank `R` and `N` non-empty values, `input_indices` has +// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`. +// +// Arguments: +// input_indices: 2-D. `N x R` matrix with the indices of non-empty values in a +// SparseTensor, possibly not in canonical ordering. +// input_values: 1-D. `N` non-empty values corresponding to `input_indices`. +// input_shape: 1-D. Shape of the input SparseTensor. +// +// Returns 2-D. `N x R` matrix with the same indices as input_indices, but +// in canonical row-major ordering.1-D. `N` non-empty values corresponding to `output_indices`. +func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseReorder", + Input: []tf.Input{ + input_indices, input_values, input_shape, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// The gradient operator for the SparseSlice op. +// +// This op takes in the upstream gradient w.r.t. non-empty values of +// the sliced `SparseTensor`, and outputs the gradients w.r.t. +// the non-empty values of input `SparseTensor`. +// +// Arguments: +// backprop_val_grad: 1-D. The gradient with respect to +// the non-empty values of the sliced `SparseTensor`. +// input_indices: 2-D. The `indices` of the input `SparseTensor`. +// input_start: 1-D. tensor represents the start of the slice. +// output_indices: 2-D. The `indices` of the sliced `SparseTensor`. +// +// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`. +func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseSliceGrad", + Input: []tf.Input{ + backprop_val_grad, input_indices, input_start, output_indices, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a tensor filled with a scalar value. +// +// This operation creates a tensor of shape `dims` and fills it with `value`. +// +// For example: +// +// ``` +// # Output tensor has shape [2, 3]. +// fill([2, 3], 9) ==> [[9, 9, 9] +// [9, 9, 9]] +// ``` +// +// `tf.fill` differs from `tf.constant` in a few ways: +// +// * `tf.fill` only supports scalar contents, whereas `tf.constant` supports +// Tensor values. +// * `tf.fill` creates an Op in the computation graph that constructs the actual +// Tensor value at runtime. This is in contrast to `tf.constant` which embeds +// the entire Tensor into the graph with a `Const` node. +// * Because `tf.fill` evaluates at graph runtime, it supports dynamic shapes +// based on other runtime Tensors, unlike `tf.constant`. +// +// Arguments: +// dims: 1-D. Represents the shape of the output tensor. +// value: 0-D (scalar). Value to fill the returned tensor. 
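+//
+// An end-to-end sketch of the `[2, 3]` example above with the Go API
+// (illustrative; it assumes the usual client import of the core `tf` package,
+// github.com/tensorflow/tensorflow/tensorflow/go, alongside this `op` package):
+//
+// ```
+// s := NewScope()
+// filled := Fill(s, Const(s, []int32{2, 3}), Const(s, int32(9)))
+// graph, err := s.Finalize()
+// if err != nil { panic(err) }
+// sess, err := tf.NewSession(graph, nil)
+// if err != nil { panic(err) }
+// out, err := sess.Run(nil, []tf.Output{filled}, nil)
+// if err != nil { panic(err) }
+// // out[0].Value() is [][]int32{{9, 9, 9}, {9, 9, 9}}
+// ```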
+// +// @compatibility(numpy) +// Equivalent to np.full +// @end_compatibility +func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Fill", + Input: []tf.Input{ + dims, value, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the maximum along segments of a tensor. +// +// Read +// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) +// for an explanation of segments. +// +// This operator is similar to the unsorted segment sum operator found +// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum). +// Instead of computing the sum over segments, it computes the maximum such that: +// +// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such +// that `segment_ids[j...] == i`. +// +// If the maximum is empty for a given segment ID `i`, it outputs the smallest +// possible value for the specific numeric type, +// `output[i] = numeric_limits<T>::lowest()`. +// +// If the given segment ID `i` is negative, then the corresponding value is +// dropped, and will not be included in the result. +// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt> +// </div> +// +// For example: +// +// ``` python +// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]]) +// tf.unsorted_segment_max(c, tf.constant([0, 1, 0]), num_segments=2) +// # ==> [[ 4, 3, 3, 4], +// # [5, 6, 7, 8]] +// ``` +// +// +// Arguments: +// +// segment_ids: A tensor whose shape is a prefix of `data.shape`. +// +// +// Returns Has same shape as data, except for the first `segment_ids.rank` +// dimensions, which are replaced with a single dimension which has size +// `num_segments`. +func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "UnsortedSegmentMax", + Input: []tf.Input{ + data, segment_ids, num_segments, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Concatenates a list of `SparseTensor` along the specified dimension. +// +// Concatenation is with respect to the dense versions of these sparse tensors. +// It is assumed that each input is a `SparseTensor` whose elements are ordered +// along increasing dimension number. +// +// All inputs' shapes must match, except for the concat dimension. The +// `indices`, `values`, and `shapes` lists must have the same length. +// +// The output shape is identical to the inputs', except along the concat +// dimension, where it is the sum of the inputs' sizes along that dimension. +// +// The output elements will be resorted to preserve the sort order along +// increasing dimension number. +// +// This op runs in `O(M log M)` time, where `M` is the total number of non-empty +// values across all inputs. This is due to the need for an internal sort in +// order to concatenate efficiently across an arbitrary dimension. 
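+//
+// The two-input example below maps onto this package roughly as follows
+// (illustrative; `aIndices`/`aValues`/`aShape` and `bIndices`/`bValues`/`bShape`
+// are assumed `tf.Output` values for the two input `SparseTensor`s in the scope `s`):
+//
+// ```
+// outIdx, outVals, outShape := SparseConcat(s,
+// 	[]tf.Output{aIndices, bIndices},
+// 	[]tf.Output{aValues, bValues},
+// 	[]tf.Output{aShape, bShape},
+// 	1) // concat_dim = 1
+// ```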
+// +// For example, if `concat_dim = 1` and the inputs are +// +// sp_inputs[0]: shape = [2, 3] +// [0, 2]: "a" +// [1, 0]: "b" +// [1, 1]: "c" +// +// sp_inputs[1]: shape = [2, 4] +// [0, 1]: "d" +// [0, 2]: "e" +// +// then the output will be +// +// shape = [2, 7] +// [0, 2]: "a" +// [0, 4]: "d" +// [0, 5]: "e" +// [1, 0]: "b" +// [1, 1]: "c" +// +// Graphically this is equivalent to doing +// +// [ a] concat [ d e ] = [ a d e ] +// [b c ] [ ] [b c ] +// +// Arguments: +// indices: 2-D. Indices of each input `SparseTensor`. +// values: 1-D. Non-empty values of each `SparseTensor`. +// shapes: 1-D. Shapes of each `SparseTensor`. +// concat_dim: Dimension to concatenate along. Must be in range [-rank, rank), +// where rank is the number of dimensions in each input `SparseTensor`. +// +// Returns 2-D. Indices of the concatenated `SparseTensor`.1-D. Non-empty values of the concatenated `SparseTensor`.1-D. Shape of the concatenated `SparseTensor`. +func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"concat_dim": concat_dim} + opspec := tf.OpSpec{ + Type: "SparseConcat", + Input: []tf.Input{ + tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// FixedUnigramCandidateSamplerAttr is an optional argument to FixedUnigramCandidateSampler. +type FixedUnigramCandidateSamplerAttr func(optionalAttr) + +// FixedUnigramCandidateSamplerVocabFile sets the optional vocab_file attribute to value. +// +// value: Each valid line in this file (which should have a CSV-like format) +// corresponds to a valid word ID. IDs are in sequential order, starting from +// num_reserved_ids. The last entry in each line is expected to be a value +// corresponding to the count or relative probability. Exactly one of vocab_file +// and unigrams needs to be passed to this op. +// If not specified, defaults to "" +func FixedUnigramCandidateSamplerVocabFile(value string) FixedUnigramCandidateSamplerAttr { + return func(m optionalAttr) { + m["vocab_file"] = value + } +} + +// FixedUnigramCandidateSamplerDistortion sets the optional distortion attribute to value. +// +// value: The distortion is used to skew the unigram probability distribution. +// Each weight is first raised to the distortion's power before adding to the +// internal unigram distribution. As a result, distortion = 1.0 gives regular +// unigram sampling (as defined by the vocab file), and distortion = 0.0 gives +// a uniform distribution. +// If not specified, defaults to 1 +func FixedUnigramCandidateSamplerDistortion(value float32) FixedUnigramCandidateSamplerAttr { + return func(m optionalAttr) { + m["distortion"] = value + } +} + +// FixedUnigramCandidateSamplerNumReservedIds sets the optional num_reserved_ids attribute to value. +// +// value: Optionally some reserved IDs can be added in the range [0, +// ..., num_reserved_ids) by the users. One use case is that a special unknown +// word token is used as ID 0. These IDs will have a sampling probability of 0. 
+// If not specified, defaults to 0 +func FixedUnigramCandidateSamplerNumReservedIds(value int64) FixedUnigramCandidateSamplerAttr { + return func(m optionalAttr) { + m["num_reserved_ids"] = value + } +} + +// FixedUnigramCandidateSamplerNumShards sets the optional num_shards attribute to value. +// +// value: A sampler can be used to sample from a subset of the original range +// in order to speed up the whole computation through parallelism. This parameter +// (together with 'shard') indicates the number of partitions that are being +// used in the overall computation. +// If not specified, defaults to 1 +// +// REQUIRES: value >= 1 +func FixedUnigramCandidateSamplerNumShards(value int64) FixedUnigramCandidateSamplerAttr { + return func(m optionalAttr) { + m["num_shards"] = value + } +} + +// FixedUnigramCandidateSamplerShard sets the optional shard attribute to value. +// +// value: A sampler can be used to sample from a subset of the original range +// in order to speed up the whole computation through parallelism. This parameter +// (together with 'num_shards') indicates the particular partition number of a +// sampler op, when partitioning is being used. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func FixedUnigramCandidateSamplerShard(value int64) FixedUnigramCandidateSamplerAttr { + return func(m optionalAttr) { + m["shard"] = value + } +} + +// FixedUnigramCandidateSamplerUnigrams sets the optional unigrams attribute to value. +// +// value: A list of unigram counts or probabilities, one per ID in sequential +// order. Exactly one of vocab_file and unigrams should be passed to this op. +// If not specified, defaults to <> +func FixedUnigramCandidateSamplerUnigrams(value []float32) FixedUnigramCandidateSamplerAttr { + return func(m optionalAttr) { + m["unigrams"] = value + } +} + +// FixedUnigramCandidateSamplerSeed sets the optional seed attribute to value. +// +// value: If either seed or seed2 are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func FixedUnigramCandidateSamplerSeed(value int64) FixedUnigramCandidateSamplerAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// FixedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value. +// +// value: An second seed to avoid seed collision. +// If not specified, defaults to 0 +func FixedUnigramCandidateSamplerSeed2(value int64) FixedUnigramCandidateSamplerAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Generates labels for candidate sampling with a learned unigram distribution. +// +// A unigram sampler could use a fixed unigram distribution read from a +// file or passed in as an in-memory array instead of building up the distribution +// from data on the fly. There is also an option to skew the distribution by +// applying a distortion power to the weights. +// +// The vocabulary file should be in CSV-like format, with the last field +// being the weight associated with the word. +// +// For each batch, this op picks a single set of sampled candidate labels. +// +// The advantages of sampling candidates per-batch are simplicity and the +// possibility of efficient dense matrix multiplication. The disadvantage is that +// the sampled candidates must be chosen independently of the context and of the +// true labels. 
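+//
+// A minimal call sketch with this package (illustrative; `trueClasses` is an
+// assumed `tf.Output` of shape `[batch_size, num_true]`, and `unigramCounts` an
+// assumed `[]float32` of per-ID counts, both defined elsewhere in the scope `s`):
+//
+// ```
+// sampled, trueExpected, sampledExpected := FixedUnigramCandidateSampler(s,
+// 	trueClasses, 1, 64, true, int64(len(unigramCounts)),
+// 	FixedUnigramCandidateSamplerUnigrams(unigramCounts))
+// ```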
+// +// Arguments: +// true_classes: A batch_size * num_true matrix, in which each row contains the +// IDs of the num_true target_classes in the corresponding original label. +// num_true: Number of true labels per context. +// num_sampled: Number of candidates to randomly sample. +// unique: If unique is true, we sample with rejection, so that all sampled +// candidates in a batch are unique. This requires some approximation to +// estimate the post-rejection sampling probabilities. +// range_max: The sampler will sample integers from the interval [0, range_max). +// +// Returns A vector of length num_sampled, in which each element is +// the ID of a sampled candidate.A batch_size * num_true matrix, representing +// the number of times each candidate is expected to occur in a batch +// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled +// candidate representing the number of times the candidate is expected +// to occur in a batch of sampled candidates. If unique=true, then this is a +// probability. +func FixedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...FixedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "FixedUnigramCandidateSampler", + Input: []tf.Input{ + true_classes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Conv2DAttr is an optional argument to Conv2D. +type Conv2DAttr func(optionalAttr) + +// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value. +// If not specified, defaults to true +func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr { + return func(m optionalAttr) { + m["use_cudnn_on_gpu"] = value + } +} + +// Conv2DExplicitPaddings sets the optional explicit_paddings attribute to value. +// +// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith +// dimension, the amount of padding inserted before and after the dimension is +// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If +// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty. +// If not specified, defaults to <> +func Conv2DExplicitPaddings(value []int64) Conv2DAttr { + return func(m optionalAttr) { + m["explicit_paddings"] = value + } +} + +// Conv2DDataFormat sets the optional data_format attribute to value. +// +// value: Specify the data format of the input and output data. With the +// default format "NHWC", the data is stored in the order of: +// [batch, height, width, channels]. +// Alternatively, the format could be "NCHW", the data storage order of: +// [batch, channels, height, width]. +// If not specified, defaults to "NHWC" +func Conv2DDataFormat(value string) Conv2DAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Conv2DDilations sets the optional dilations attribute to value. +// +// value: 1-D tensor of length 4. The dilation factor for each dimension of +// `input`. If set to k > 1, there will be k-1 skipped cells between each +// filter element on that dimension. 
The dimension order is determined by the +// value of `data_format`, see above for details. Dilations in the batch and +// depth dimensions must be 1. +// If not specified, defaults to <i:1 i:1 i:1 i:1 > +func Conv2DDilations(value []int64) Conv2DAttr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes a 2-D convolution given 4-D `input` and `filter` tensors. +// +// Given an input tensor of shape `[batch, in_height, in_width, in_channels]` +// and a filter / kernel tensor of shape +// `[filter_height, filter_width, in_channels, out_channels]`, this op +// performs the following: +// +// 1. Flattens the filter to a 2-D matrix with shape +// `[filter_height * filter_width * in_channels, output_channels]`. +// 2. Extracts image patches from the input tensor to form a *virtual* +// tensor of shape `[batch, out_height, out_width, +// filter_height * filter_width * in_channels]`. +// 3. For each patch, right-multiplies the filter matrix and the image patch +// vector. +// +// In detail, with the default NHWC format, +// +// output[b, i, j, k] = +// sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] * +// filter[di, dj, q, k] +// +// Must have `strides[0] = strides[3] = 1`. For the most common case of the same +// horizontal and vertices strides, `strides = [1, stride, stride, 1]`. +// +// Arguments: +// input: A 4-D tensor. The dimension order is interpreted according to the value +// of `data_format`, see below for details. +// filter: A 4-D tensor of shape +// `[filter_height, filter_width, in_channels, out_channels]` +// strides: 1-D tensor of length 4. The stride of the sliding window for each +// dimension of `input`. The dimension order is determined by the value of +// `data_format`, see below for details. +// padding: The type of padding algorithm to use. +// +// Returns A 4-D tensor. The dimension order is determined by the value of +// `data_format`, see below for details. +func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Conv2D", + Input: []tf.Input{ + input, filter, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Deserialize `SparseTensor` objects. +// +// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where +// the last dimension stores serialized `SparseTensor` objects and the other N +// dimensions (N >= 0) correspond to a batch. The ranks of the original +// `SparseTensor` objects must all match. When the final `SparseTensor` is +// created, its rank is the rank of the incoming `SparseTensor` objects plus N; +// the sparse tensors have been concatenated along new dimensions, one for each +// batch. +// +// The output `SparseTensor` object's shape values for the original dimensions +// are the max across the input `SparseTensor` objects' shape values for the +// corresponding dimensions. The new dimensions match the size of the batch. +// +// The input `SparseTensor` objects' indices are assumed ordered in +// standard lexicographic order. If this is not the case, after this +// step run `SparseReorder` to restore index ordering. 
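+//
+// A minimal call sketch with this package (illustrative; `serialized` is an
+// assumed `tf.Output` holding the serialized `SparseTensor` objects in the
+// scope `s`, and the dtype passed must match the dtype of their values):
+//
+// ```
+// idx, vals, shape := DeserializeSparse(s, serialized, tf.Int64)
+// ```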
+// +// For example, if the serialized input is a `[2 x 3]` matrix representing two +// original `SparseTensor` objects: +// +// index = [ 0] +// [10] +// [20] +// values = [1, 2, 3] +// shape = [50] +// +// and +// +// index = [ 2] +// [10] +// values = [4, 5] +// shape = [30] +// +// then the final deserialized `SparseTensor` will be: +// +// index = [0 0] +// [0 10] +// [0 20] +// [1 2] +// [1 10] +// values = [1, 2, 3, 4, 5] +// shape = [2 50] +// +// Arguments: +// serialized_sparse: The serialized `SparseTensor` objects. The last dimension +// must have 3 columns. +// dtype: The `dtype` of the serialized `SparseTensor` objects. +func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype} + opspec := tf.OpSpec{ + Type: "DeserializeSparse", + Input: []tf.Input{ + serialized_sparse, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// QuantizedReluXAttr is an optional argument to QuantizedReluX. +type QuantizedReluXAttr func(optionalAttr) + +// QuantizedReluXOutType sets the optional out_type attribute to value. +// If not specified, defaults to DT_QUINT8 +func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)` +// +// Arguments: +// +// +// min_features: The float value that the lowest quantized value represents. +// max_features: The float value that the highest quantized value represents. +// +// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents. +func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QuantizedReluX", + Input: []tf.Input{ + features, max_value, min_features, max_features, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr is an optional argument to RetrieveTPUEmbeddingStochasticGradientDescentParameters. +type RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr func(optionalAttr) + +// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName(value string) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Retrieve SGD embedding parameters. 
+// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the stochastic gradient descent optimization algorithm. +func RetrieveTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr) (parameters tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingStochasticGradientDescentParameters", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Adds sparse updates to the variable referenced by `resource`. +// +// This operation computes +// +// # Scalar indices +// ref[indices, ...] += updates[...] +// +// # Vector indices (for each i) +// ref[indices[i], ...] += updates[i, ...] +// +// # High rank indices (for each i, ..., j) +// ref[indices[i, ..., j], ...] += updates[i, ..., j, ...] +// +// Duplicate entries are handled correctly: if multiple `indices` reference +// the same location, their contributions add. +// +// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. +// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt> +// </div> +// +// Arguments: +// resource: Should be from a `Variable` node. +// indices: A tensor of indices into the first dimension of `ref`. +// updates: A tensor of updated values to add to `ref`. +// +// Returns the created operation. +func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ResourceScatterAdd", + Input: []tf.Input{ + resource, indices, updates, + }, + } + return scope.AddOperation(opspec) +} + +// ShardDatasetAttr is an optional argument to ShardDataset. +type ShardDatasetAttr func(optionalAttr) + +// ShardDatasetRequireNonEmpty sets the optional require_non_empty attribute to value. +// If not specified, defaults to false +func ShardDatasetRequireNonEmpty(value bool) ShardDatasetAttr { + return func(m optionalAttr) { + m["require_non_empty"] = value + } +} + +// Creates a `Dataset` that includes only 1/`num_shards` of this dataset. +// +// Arguments: +// +// num_shards: An integer representing the number of shards operating in parallel. +// index: An integer representing the current worker index. +// +// +func ShardDataset(scope *Scope, input_dataset tf.Output, num_shards tf.Output, index tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShardDatasetAttr) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ShardDataset", + Input: []tf.Input{ + input_dataset, num_shards, index, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Outputs a `Summary` protocol buffer with scalar values. 
+// +// The input `tags` and `values` must have the same shape. The generated summary +// has a summary value for each tag-value pair in `tags` and `values`. +// +// Arguments: +// tags: Tags for the summary. +// values: Same shape as `tags. Values for the summary. +// +// Returns Scalar. Serialized `Summary` protocol buffer. +func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ScalarSummary", + Input: []tf.Input{ + tags, values, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Adds two `SparseTensor` objects to produce another `SparseTensor`. +// +// The input `SparseTensor` objects' indices are assumed ordered in standard +// lexicographic order. If this is not the case, before this step run +// `SparseReorder` to restore index ordering. +// +// By default, if two values sum to zero at some index, the output `SparseTensor` +// would still include that particular location in its index, storing a zero in the +// corresponding value slot. To override this, callers can specify `thresh`, +// indicating that if the sum has a magnitude strictly smaller than `thresh`, its +// corresponding value and index would then not be included. In particular, +// `thresh == 0` (default) means everything is kept and actual thresholding happens +// only for a positive value. +// +// In the following shapes, `nnz` is the count after taking `thresh` into account. +// +// Arguments: +// a_indices: 2-D. The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix. +// a_values: 1-D. The `values` of the first `SparseTensor`, size `[nnz]` Vector. +// a_shape: 1-D. The `shape` of the first `SparseTensor`, size `[ndims]` Vector. +// b_indices: 2-D. The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix. +// b_values: 1-D. The `values` of the second `SparseTensor`, size `[nnz]` Vector. +// b_shape: 1-D. The `shape` of the second `SparseTensor`, size `[ndims]` Vector. +// thresh: 0-D. The magnitude threshold that determines if an output value/index +// pair takes space. +func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseAdd", + Input: []tf.Input{ + a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// The gradient operator for the SparseAdd op. +// +// The SparseAdd op calculates A + B, where A, B, and the sum are all represented +// as `SparseTensor` objects. This op takes in the upstream gradient w.r.t. +// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty +// values of A and B. +// +// Arguments: +// backprop_val_grad: 1-D with shape `[nnz(sum)]`. The gradient with respect to +// the non-empty values of the sum. +// a_indices: 2-D. The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`. +// b_indices: 2-D. The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`. +// sum_indices: 2-D. The `indices` of the sum `SparseTensor`, size +// `[nnz(sum), ndims]`. +// +// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the +// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the +// non-empty values of B. 
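A minimal usage sketch for the generated `SparseAdd` wrapper above. It is illustrative only and assumes the standard Go bindings, i.e. `tf "github.com/tensorflow/tensorflow/tensorflow/go"` plus the `op` package this file belongs to; the sparse inputs are made-up values.

```go
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// Two 1-D SparseTensors with dense shape [4]: a = [0, 1, 0, 2], b = [3, 0, 0, 4].
	aIndices := op.Const(s, [][]int64{{1}, {3}})
	aValues := op.Const(s, []float32{1, 2})
	aShape := op.Const(s, []int64{4})
	bIndices := op.Const(s, [][]int64{{0}, {3}})
	bValues := op.Const(s, []float32{3, 4})
	bShape := op.Const(s, []int64{4})
	thresh := op.Const(s, float32(0)) // keep every non-zero sum (see thresh doc above)

	sumIdx, sumVals, sumShape := op.SparseAdd(s, aIndices, aValues, aShape, bIndices, bValues, bShape, thresh)

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()
	out, err := sess.Run(nil, []tf.Output{sumIdx, sumVals, sumShape}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value(), out[1].Value(), out[2].Value())
}
```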
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseAddGrad", + Input: []tf.Input{ + backprop_val_grad, a_indices, b_indices, sum_indices, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// QuantizedMatMulAttr is an optional argument to QuantizedMatMul. +type QuantizedMatMulAttr func(optionalAttr) + +// QuantizedMatMulToutput sets the optional Toutput attribute to value. +// If not specified, defaults to DT_QINT32 +func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr { + return func(m optionalAttr) { + m["Toutput"] = value + } +} + +// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value. +// +// value: If true, `a` is transposed before multiplication. +// If not specified, defaults to false +func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr { + return func(m optionalAttr) { + m["transpose_a"] = value + } +} + +// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value. +// +// value: If true, `b` is transposed before multiplication. +// If not specified, defaults to false +func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr { + return func(m optionalAttr) { + m["transpose_b"] = value + } +} + +// QuantizedMatMulTactivation sets the optional Tactivation attribute to value. +// +// value: The type of output produced by activation function +// following this operation. +// If not specified, defaults to DT_QUINT8 +func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr { + return func(m optionalAttr) { + m["Tactivation"] = value + } +} + +// Perform a quantized matrix multiplication of `a` by the matrix `b`. +// +// The inputs must be two-dimensional matrices and the inner dimension of +// `a` (after being transposed if `transpose_a` is non-zero) must match the +// outer dimension of `b` (after being transposed if `transposed_b` is +// non-zero). +// +// Arguments: +// a: Must be a two-dimensional tensor. +// b: Must be a two-dimensional tensor. +// min_a: The float value that the lowest quantized `a` value represents. +// max_a: The float value that the highest quantized `a` value represents. +// min_b: The float value that the lowest quantized `b` value represents. +// max_b: The float value that the highest quantized `b` value represents. +// +// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. +func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QuantizedMatMul", + Input: []tf.Input{ + a, b, min_a, max_a, min_b, max_b, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl. +type ResourceApplyFtrlAttr func(optionalAttr) + +// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value. 
+// +// value: If `True`, updating of the var and accum tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' according to the Ftrl-proximal scheme. +// +// accum_new = accum + grad * grad +// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +// accum = accum_new +// +// Arguments: +// var_: Should be from a Variable(). +// accum: Should be from a Variable(). +// linear: Should be from a Variable(). +// grad: The gradient. +// lr: Scaling factor. Must be a scalar. +// l1: L1 regulariation. Must be a scalar. +// l2: L2 regulariation. Must be a scalar. +// lr_power: Scaling factor. Must be a scalar. +// +// Returns the created operation. +func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceApplyFtrl", + Input: []tf.Input{ + var_, accum, linear, grad, lr, l1, l2, lr_power, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Runs multiple additive regression ensemble predictors on input instances and +// +// computes the update to cached logits. It is designed to be used during training. +// It traverses the trees starting from cached tree id and cached node id and +// calculates the updates to be pushed to the cache. +// +// Arguments: +// +// cached_tree_ids: Rank 1 Tensor containing cached tree ids which is the starting +// tree of prediction. +// cached_node_ids: Rank 1 Tensor containing cached node id which is the starting +// node of prediction. +// bucketized_features: A list of rank 1 Tensors containing bucket id for each +// feature. +// logits_dimension: scalar, dimension of the logits, to be used for partial logits +// shape. +// +// Returns Rank 2 Tensor containing logits update (with respect to cached +// values stored) for each example.Rank 1 Tensor containing new tree ids for each example.Rank 1 Tensor containing new node ids in the new tree_ids. +func BoostedTreesTrainingPredict(scope *Scope, tree_ensemble_handle tf.Output, cached_tree_ids tf.Output, cached_node_ids tf.Output, bucketized_features []tf.Output, logits_dimension int64) (partial_logits tf.Output, tree_ids tf.Output, node_ids tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"logits_dimension": logits_dimension} + opspec := tf.OpSpec{ + Type: "BoostedTreesTrainingPredict", + Input: []tf.Input{ + tree_ensemble_handle, cached_tree_ids, cached_node_ids, tf.OutputList(bucketized_features), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// InfeedEnqueueTupleAttr is an optional argument to InfeedEnqueueTuple. +type InfeedEnqueueTupleAttr func(optionalAttr) + +// InfeedEnqueueTupleLayouts sets the optional layouts attribute to value. 
+// +// value: A vector holding the requested layout in minor-to-major sequence for +// all the tuple shapes, in the order the shapes appear in the "shapes" input. +// The layout elements for a sub-shape can be set to -1, in which case the +// corresponding layout will be computed by the infeed operation. +// If not specified, defaults to <> +func InfeedEnqueueTupleLayouts(value []int64) InfeedEnqueueTupleAttr { + return func(m optionalAttr) { + m["layouts"] = value + } +} + +// InfeedEnqueueTupleDeviceOrdinal sets the optional device_ordinal attribute to value. +// +// value: The TPU device to use. This should be -1 when the Op +// is running on a TPU device, and >= 0 when the Op is running on the CPU +// device. +// If not specified, defaults to -1 +func InfeedEnqueueTupleDeviceOrdinal(value int64) InfeedEnqueueTupleAttr { + return func(m optionalAttr) { + m["device_ordinal"] = value + } +} + +// Feeds multiple Tensor values into the computation as an XLA tuple. +// +// Arguments: +// inputs: A list of tensors that will be provided using the infeed mechanism. +// shapes: The shapes of each tensor in `inputs`. +// +// Returns the created operation. +func InfeedEnqueueTuple(scope *Scope, inputs []tf.Output, shapes []tf.Shape, optional ...InfeedEnqueueTupleAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"shapes": shapes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "InfeedEnqueueTuple", + Input: []tf.Input{ + tf.OutputList(inputs), + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// OutfeedDequeueTupleAttr is an optional argument to OutfeedDequeueTuple. +type OutfeedDequeueTupleAttr func(optionalAttr) + +// OutfeedDequeueTupleDeviceOrdinal sets the optional device_ordinal attribute to value. +// +// value: The TPU device to use. This should be -1 when the Op +// is running on a TPU device, and >= 0 when the Op is running on the CPU +// device. +// If not specified, defaults to -1 +func OutfeedDequeueTupleDeviceOrdinal(value int64) OutfeedDequeueTupleAttr { + return func(m optionalAttr) { + m["device_ordinal"] = value + } +} + +// Retrieve multiple values from the computation outfeed. +// +// This operation will block indefinitely until data is available. Output `i` +// corresponds to XLA tuple element `i`. +// +// Arguments: +// dtypes: The element types of each element in `outputs`. +// shapes: The shapes of each tensor in `outputs`. +// +// Returns A list of tensors that will be read from the outfeed. +func OutfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape, optional ...OutfeedDequeueTupleAttr) (outputs []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "OutfeedDequeueTuple", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil { + scope.UpdateErr("OutfeedDequeueTuple", err) + return + } + return outputs +} + +// DecodeCSVAttr is an optional argument to DecodeCSV. +type DecodeCSVAttr func(optionalAttr) + +// DecodeCSVFieldDelim sets the optional field_delim attribute to value. +// +// value: char delimiter to separate fields in a record. 
+// If not specified, defaults to "," +func DecodeCSVFieldDelim(value string) DecodeCSVAttr { + return func(m optionalAttr) { + m["field_delim"] = value + } +} + +// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value. +// +// value: If false, treats double quotation marks as regular +// characters inside of the string fields (ignoring RFC 4180, Section 2, +// Bullet 5). +// If not specified, defaults to true +func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr { + return func(m optionalAttr) { + m["use_quote_delim"] = value + } +} + +// DecodeCSVNaValue sets the optional na_value attribute to value. +// +// value: Additional string to recognize as NA/NaN. +// If not specified, defaults to "" +func DecodeCSVNaValue(value string) DecodeCSVAttr { + return func(m optionalAttr) { + m["na_value"] = value + } +} + +// DecodeCSVSelectCols sets the optional select_cols attribute to value. +// If not specified, defaults to <> +func DecodeCSVSelectCols(value []int64) DecodeCSVAttr { + return func(m optionalAttr) { + m["select_cols"] = value + } +} + +// Convert CSV records to tensors. Each column maps to one tensor. +// +// RFC 4180 format is expected for the CSV records. +// (https://tools.ietf.org/html/rfc4180) +// Note that we allow leading and trailing spaces with int or float field. +// +// Arguments: +// records: Each string is a record/row in the csv and all records should have +// the same format. +// record_defaults: One tensor per column of the input record, with either a +// scalar default value for that column or an empty vector if the column is +// required. +// +// Returns Each tensor will have the same shape as records. +func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DecodeCSV", + Input: []tf.Input{ + records, tf.OutputList(record_defaults), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if output, idx, err = makeOutputList(op, idx, "output"); err != nil { + scope.UpdateErr("DecodeCSV", err) + return + } + return output +} + +// Returns the complex conjugate of a complex number. +// +// Given a tensor `input` of complex numbers, this operation returns a tensor of +// complex numbers that are the complex conjugate of each element in `input`. The +// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the +// real part and *b* is the imaginary part. +// +// The complex conjugate returned by this operation is of the form \\(a - bj\\). +// +// For example: +// +// ``` +// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j] +// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j] +// ``` +func Conj(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Conj", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the gradient for the sqrt of `x` wrt its input. +// +// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy` +// is the corresponding input gradient. 
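A short, hypothetical graph-construction fragment showing how the `DecodeCSV` wrapper and its functional-option attributes above could be combined; it assumes the same `tf`/`op` imports as the earlier `SparseAdd` sketch and uses made-up records.

```go
s := op.NewScope()
// Two CSV records with an int column, a float column, and a string column.
records := op.Const(s, []string{"1,2.5,foo", "3,4.5,bar"})
// One default tensor per column; an empty default would make that column required.
defaults := []tf.Output{
	op.Const(s, []int32{0}),
	op.Const(s, []float32{0}),
	op.Const(s, []string{""}),
}
// Optional attributes use the generated functional-option pattern.
cols := op.DecodeCSV(s, records, defaults,
	op.DecodeCSVFieldDelim(","), op.DecodeCSVNaValue("NA"))
_ = cols // cols[0], cols[1], cols[2] each have the same shape as `records`
```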
+func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SqrtGrad", + Input: []tf.Input{ + y, dy, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Transforms a vector of brain.Example protos (as strings) into typed tensors. +// +// Arguments: +// serialized: A vector containing a batch of binary serialized Example protos. +// names: A vector containing the names of the serialized protos. +// May contain, for example, table key (descriptive) names for the +// corresponding serialized protos. These are purely useful for debugging +// purposes, and the presence of values here has no effect on the output. +// May also be an empty vector if no names are available. +// If non-empty, this vector must be the same length as "serialized". +// sparse_keys: A list of Nsparse string Tensors (scalars). +// The keys expected in the Examples' features associated with sparse values. +// dense_keys: A list of Ndense string Tensors (scalars). +// The keys expected in the Examples' features associated with dense values. +// dense_defaults: A list of Ndense Tensors (some may be empty). +// dense_defaults[j] provides default values +// when the example's feature_map lacks dense_key[j]. If an empty Tensor is +// provided for dense_defaults[j], then the Feature dense_keys[j] is required. +// The input type is inferred from dense_defaults[j], even when it's empty. +// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined, +// then the shape of dense_defaults[j] must match that of dense_shapes[j]. +// If dense_shapes[j] has an undefined major dimension (variable strides dense +// feature), dense_defaults[j] must contain a single element: +// the padding element. +// sparse_types: A list of Nsparse types; the data types of data in each Feature +// given in sparse_keys. +// Currently the ParseExample supports DT_FLOAT (FloatList), +// DT_INT64 (Int64List), and DT_STRING (BytesList). +// dense_shapes: A list of Ndense shapes; the shapes of data in each Feature +// given in dense_keys. +// The number of elements in the Feature corresponding to dense_key[j] +// must always equal dense_shapes[j].NumEntries(). +// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output +// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN): +// The dense outputs are just the inputs row-stacked by batch. +// This works for dense_shapes[j] = (-1, D1, ..., DN). In this case +// the shape of the output Tensor dense_values[j] will be +// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks +// of elements of length D1 * .... * DN, across all minibatch entries +// in the input. Any minibatch entry with less than M blocks of elements of +// length D1 * ... * DN will be padded with the corresponding default_value +// scalar element along the second dimension. 
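A rough illustration of the `ParseExample` signature documented above, with one sparse key and one dense key that has a scalar default; the feature names and dtypes are invented for the sketch, and empty strings stand in for serialized `tf.Example` protos (same imports assumed as before).

```go
s := op.NewScope()
serialized := op.Const(s, []string{"", ""}) // placeholders for serialized Example protos
names := op.Const(s, []string{"example0", "example1"}) // debugging names only
sparseKeys := []tf.Output{op.Const(s, "tags")}
denseKeys := []tf.Output{op.Const(s, "label")}
// A non-empty default makes "label" optional; its dtype fixes the dense output dtype.
denseDefaults := []tf.Output{op.Const(s, float32(0))}
sparseTypes := []tf.DataType{tf.String}
denseShapes := []tf.Shape{tf.ScalarShape()}

spIdx, spVals, spShapes, denseVals := op.ParseExample(s, serialized, names,
	sparseKeys, denseKeys, denseDefaults, sparseTypes, denseShapes)
_, _, _, _ = spIdx, spVals, spShapes, denseVals
```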
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes} + opspec := tf.OpSpec{ + Type: "ParseExample", + Input: []tf.Input{ + serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil { + scope.UpdateErr("ParseExample", err) + return + } + if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil { + scope.UpdateErr("ParseExample", err) + return + } + if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil { + scope.UpdateErr("ParseExample", err) + return + } + if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil { + scope.UpdateErr("ParseExample", err) + return + } + return sparse_indices, sparse_values, sparse_shapes, dense_values +} + +// TryRpcAttr is an optional argument to TryRpc. +type TryRpcAttr func(optionalAttr) + +// TryRpcProtocol sets the optional protocol attribute to value. +// +// value: RPC protocol to use. Empty string means use the default protocol. +// Options include 'grpc'. +// If not specified, defaults to "" +func TryRpcProtocol(value string) TryRpcAttr { + return func(m optionalAttr) { + m["protocol"] = value + } +} + +// TryRpcFailFast sets the optional fail_fast attribute to value. +// +// value: `boolean`. If `true` (default), then failures to connect +// (i.e., the server does not immediately respond) cause an RPC failure. +// If not specified, defaults to true +func TryRpcFailFast(value bool) TryRpcAttr { + return func(m optionalAttr) { + m["fail_fast"] = value + } +} + +// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value. +// +// value: `int`. If `0` (default), then the kernel will run the RPC +// request and only time out if the RPC deadline passes or the session times out. +// If this value is greater than `0`, then the op will raise an exception if +// the RPC takes longer than `timeout_in_ms`. +// If not specified, defaults to 0 +func TryRpcTimeoutInMs(value int64) TryRpcAttr { + return func(m optionalAttr) { + m["timeout_in_ms"] = value + } +} + +// Perform batches of RPC requests. +// +// This op asynchronously performs either a single RPC request, or a batch +// of requests. RPC requests are defined by three main parameters: +// +// - `address` (the host+port or BNS address of the request) +// - `method` (the method name for the request) +// - `request` (the serialized proto string, or vector of strings, +// of the RPC request argument). 
+// +// For example, if you have an RPC service running on port localhost:2345, +// and its interface is configured with the following proto declaration: +// +// ``` +// service MyService { +// rpc MyMethod(MyRequestProto) returns (MyResponseProto) { +// } +// }; +// ``` +// +// then call this op with arguments: +// +// ``` +// address = "localhost:2345" +// method = "MyService/MyMethod" +// ``` +// +// The `request` tensor is a string tensor representing serialized `MyRequestProto` +// strings; and the output string tensor `response` will have the same shape +// and contain (upon successful completion) corresponding serialized +// `MyResponseProto` strings. +// +// For example, to send a single, empty, `MyRequestProto`, call +// this op with `request = ""`. To send 5 **parallel** empty requests, +// call this op with `request = ["", "", "", "", ""]`. +// +// More generally, one can create a batch of `MyRequestProto` serialized protos +// from regular batched tensors using the `encode_proto` op, and convert +// the response `MyResponseProto` serialized protos to batched tensors +// using the `decode_proto` op. +// +// **NOTE** Working with serialized proto strings is faster than instantiating +// actual proto objects in memory, so no performance degradation is expected +// compared to writing custom kernels for this workflow. +// +// Unlike the standard `Rpc` op, if the connection fails or the remote worker +// returns an error status, this op does **not** reraise the exception. +// Instead, the `status_code` and `status_message` entry for the corresponding RPC +// call is set with the error returned from the RPC call. The `response` tensor +// will contain valid response values for those minibatch entries whose RPCs did +// not fail; the rest of the entries will have empty strings. +// +// Arguments: +// address: `0-D` or `1-D`. The address (i.e. host_name:port) of the RPC server. +// If this tensor has more than 1 element, then multiple parallel rpc requests +// are sent. This argument broadcasts with `method` and `request`. +// method: `0-D` or `1-D`. The method address on the RPC server. +// If this tensor has more than 1 element, then multiple parallel rpc requests +// are sent. This argument broadcasts with `address` and `request`. +// request: `0-D` or `1-D`. Serialized proto strings: the rpc request argument. +// If this tensor has more than 1 element, then multiple parallel rpc requests +// are sent. This argument broadcasts with `address` and `method`. +// +// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`. Values correspond to tensorflow Status enum codes.Same shape as `request`. Values correspond to Status messages +// returned from the RPC calls. +func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TryRpc", + Input: []tf.Input{ + address, method, request, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Does nothing. Only useful as a placeholder for control edges. +// +// Returns the created operation. 
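A hedged sketch of building a `TryRpc` node with the optional attributes defined above, following the address/method conventions from the comment; the service, endpoint, and timeout are hypothetical, and the node is only constructed, not run.

```go
s := op.NewScope()
// A single request to a hypothetical gRPC service; address, method and request broadcast.
address := op.Const(s, "localhost:2345")
method := op.Const(s, "MyService/MyMethod")
request := op.Const(s, "") // an empty serialized request proto
response, statusCode, statusMessage := op.TryRpc(s, address, method, request,
	op.TryRpcProtocol("grpc"), op.TryRpcTimeoutInMs(5000))
_, _, _ = response, statusCode, statusMessage
```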
+func NoOp(scope *Scope) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "NoOp", + } + return scope.AddOperation(opspec) +} + +// Generate a sharded filename. The filename is printf formatted as +// +// %s-%05d-of-%05d, basename, shard, num_shards. +func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ShardedFilename", + Input: []tf.Input{ + basename, shard, num_shards, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MaxPoolGradGradWithArgmaxAttr is an optional argument to MaxPoolGradGradWithArgmax. +type MaxPoolGradGradWithArgmaxAttr func(optionalAttr) + +// MaxPoolGradGradWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value. +// +// value: Whether to include batch dimension in flattened index of `argmax`. +// If not specified, defaults to false +func MaxPoolGradGradWithArgmaxIncludeBatchInIndex(value bool) MaxPoolGradGradWithArgmaxAttr { + return func(m optionalAttr) { + m["include_batch_in_index"] = value + } +} + +// Computes second-order gradients of the maxpooling function. +// +// Arguments: +// input: The original input. +// grad: 4-D with shape `[batch, height, width, channels]`. Gradients w.r.t. the +// input of `max_pool`. +// argmax: The indices of the maximum values chosen for each output of `max_pool`. +// ksize: The size of the window for each dimension of the input tensor. +// strides: The stride of the sliding window for each dimension of the +// input tensor. +// padding: The type of padding algorithm to use. +// +// Returns Gradients of gradients w.r.t. the input of `max_pool`. +func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradWithArgmaxAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -10368,9 +16285,9 @@ func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, a(attrs) } opspec := tf.OpSpec{ - Type: "MaxPoolGradGrad", + Type: "MaxPoolGradGradWithArgmax", Input: []tf.Input{ - orig_input, orig_output, grad, + input, grad, argmax, }, Attrs: attrs, } @@ -10378,165 +16295,88 @@ func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, return op.Output(0) } -// UnbatchGradAttr is an optional argument to UnbatchGrad. -type UnbatchGradAttr func(optionalAttr) +// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve. +type MatrixTriangularSolveAttr func(optionalAttr) -// UnbatchGradContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func UnbatchGradContainer(value string) UnbatchGradAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// UnbatchGradSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func UnbatchGradSharedName(value string) UnbatchGradAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Gradient of Unbatch. +// MatrixTriangularSolveLower sets the optional lower attribute to value. // -// Acts like Batch but using the given batch_index index of batching things as they -// become available. This ensures that the gradients are propagated back in the -// same session which did the forward pass. -// -// original_input: The input to the Unbatch operation this is the gradient of. 
-// batch_index: The batch_index given to the Unbatch operation this is the gradient -// of. -// grad: The downstream gradient. -// id: The id scalar emitted by Batch. -// batched_grad: The return value, either an empty tensor or the batched gradient. -// container: Container to control resource sharing. -// shared_name: Instances of UnbatchGrad with the same container and shared_name -// are assumed to possibly belong to the same batch. If left empty, the op name -// will be used as the shared name. -func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output, grad tf.Output, id tf.Output, optional ...UnbatchGradAttr) (batched_grad tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "UnbatchGrad", - Input: []tf.Input{ - original_input, batch_index, grad, id, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// BiasAddAttr is an optional argument to BiasAdd. -type BiasAddAttr func(optionalAttr) - -// BiasAddDataFormat sets the optional data_format attribute to value. -// -// value: Specify the data format of the input and output data. With the -// default format "NHWC", the bias tensor will be added to the last dimension -// of the value tensor. -// Alternatively, the format could be "NCHW", the data storage order of: -// [batch, in_channels, in_height, in_width]. -// The tensor will be added to "in_channels", the third-to-the-last -// dimension. -// If not specified, defaults to "NHWC" -func BiasAddDataFormat(value string) BiasAddAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Adds `bias` to `value`. -// -// This is a special case of `tf.add` where `bias` is restricted to be 1-D. -// Broadcasting is supported, so `value` may have any number of dimensions. -// -// Arguments: -// value: Any number of dimensions. -// bias: 1-D with size the last dimension of `value`. -// -// Returns Broadcasted sum of `value` and `bias`. -func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "BiasAdd", - Input: []tf.Input{ - value, bias, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2. -type FusedBatchNormGradV2Attr func(optionalAttr) - -// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value. -// -// value: A small float number added to the variance of x. -// If not specified, defaults to 0.0001 -func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr { - return func(m optionalAttr) { - m["epsilon"] = value - } -} - -// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value. -// -// value: The data format for y_backprop, x, x_backprop. -// Either "NHWC" (default) or "NCHW". -// If not specified, defaults to "NHWC" -func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value. -// -// value: A bool value to indicate the operation is for training (default) -// or inference. 
+// value: Boolean indicating whether the innermost matrices in `matrix` are +// lower or upper triangular. // If not specified, defaults to true -func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr { +func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr { return func(m optionalAttr) { - m["is_training"] = value + m["lower"] = value } } -// Gradient for batch normalization. +// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value. // -// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". -// The size of 1D Tensors matches the dimension C of the 4D Tensors. +// value: Boolean indicating whether to solve with `matrix` or its (block-wise) +// adjoint. +// +// @compatibility(numpy) +// Equivalent to scipy.linalg.solve_triangular +// @end_compatibility +// If not specified, defaults to false +func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr { + return func(m optionalAttr) { + m["adjoint"] = value + } +} + +// Solves systems of linear equations with upper or lower triangular matrices by backsubstitution. +// +// +// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form +// square matrices. If `lower` is `True` then the strictly upper triangular part +// of each inner-most matrix is assumed to be zero and not accessed. +// If `lower` is False then the strictly lower triangular part of each inner-most +// matrix is assumed to be zero and not accessed. +// `rhs` is a tensor of shape `[..., M, K]`. +// +// The output is a tensor of shape `[..., M, K]`. If `adjoint` is +// `True` then the innermost matrices in `output` satisfy matrix equations +// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`. +// If `adjoint` is `False` then the strictly then the innermost matrices in +// `output` satisfy matrix equations +// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`. +// +// Example: +// ```python +// +// a = tf.constant([[3, 0, 0, 0], +// [2, 1, 0, 0], +// [1, 0, 1, 0], +// [1, 1, 1, 1]], dtype=tf.float32) +// +// b = tf.constant([[4], +// [2], +// [4], +// [2]], dtype=tf.float32) +// +// x = tf.linalg.triangular_solve(a, b, lower=True) +// x +// # <tf.Tensor: id=257, shape=(4, 1), dtype=float32, numpy= +// # array([[ 1.3333334 ], +// # [-0.66666675], +// # [ 2.6666665 ], +// # [-1.3333331 ]], dtype=float32)> +// +// # in python3 one can use `a@x` +// tf.matmul(a, x) +// # <tf.Tensor: id=263, shape=(4, 1), dtype=float32, numpy= +// # array([[4. ], +// # [2. ], +// # [4. ], +// # [1.9999999]], dtype=float32)> +// ``` // // Arguments: -// y_backprop: A 4D Tensor for the gradient with respect to y. -// x: A 4D Tensor for input data. -// scale: A 1D Tensor for scaling factor, to scale the normalized x. -// reserve_space_1: When is_training is True, a 1D Tensor for the computed batch -// mean to be reused in gradient computation. When is_training is -// False, a 1D Tensor for the population mean to be reused in both -// 1st and 2nd order gradient computation. -// reserve_space_2: When is_training is True, a 1D Tensor for the computed batch -// variance (inverted variance in the cuDNN case) to be reused in -// gradient computation. When is_training is False, a 1D Tensor -// for the population variance to be reused in both 1st and 2nd -// order gradient computation. +// matrix: Shape is `[..., M, M]`. +// rhs: Shape is `[..., M, K]`. 
// -// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input -// in FusedBatchNorm. -func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) { +// Returns Shape is `[..., M, K]`. +func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -10545,14 +16385,29 @@ func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale a(attrs) } opspec := tf.OpSpec{ - Type: "FusedBatchNormGradV2", + Type: "MatrixTriangularSolve", Input: []tf.Input{ - y_backprop, x, scale, reserve_space_1, reserve_space_2, + matrix, rhs, }, Attrs: attrs, } op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4) + return op.Output(0) +} + +// Computes the derivative of a Gamma random sample w.r.t. `alpha`. +func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "RandomGammaGrad", + Input: []tf.Input{ + alpha, sample, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) } // FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad. @@ -10630,97 +16485,201 @@ func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale t return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4) } -// EmptyAttr is an optional argument to Empty. -type EmptyAttr func(optionalAttr) +// LoadTPUEmbeddingAdadeltaParametersAttr is an optional argument to LoadTPUEmbeddingAdadeltaParameters. +type LoadTPUEmbeddingAdadeltaParametersAttr func(optionalAttr) -// EmptyInit sets the optional init attribute to value. +// LoadTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 // -// value: If True, initialize the returned tensor with the default value of dtype. Otherwise, the implementation is free not to initializethe tensor's content. -// If not specified, defaults to false -func EmptyInit(value bool) EmptyAttr { +// REQUIRES: value >= -1 +func LoadTPUEmbeddingAdadeltaParametersTableId(value int64) LoadTPUEmbeddingAdadeltaParametersAttr { return func(m optionalAttr) { - m["init"] = value + m["table_id"] = value } } -// Creates a tensor with the given shape. -// -// This operation creates a tensor of `shape` and `dtype`. -// -// Arguments: -// shape: 1-D. Represents the shape of the output tensor. -// -// -// Returns A `Tensor` of type `T`. -func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Empty", - Input: []tf.Input{ - shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes softsign gradients for a softsign operation. 
-// -// Arguments: -// gradients: The backpropagated gradients to the corresponding softsign operation. -// features: The features passed as input to the corresponding softsign operation. -// -// Returns The gradients: `gradients / (1 + abs(features)) ** 2`. -func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SoftsignGrad", - Input: []tf.Input{ - gradients, features, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// DecodeCompressedAttr is an optional argument to DecodeCompressed. -type DecodeCompressedAttr func(optionalAttr) - -// DecodeCompressedCompressionType sets the optional compression_type attribute to value. -// -// value: A scalar containing either (i) the empty string (no -// compression), (ii) "ZLIB", or (iii) "GZIP". +// LoadTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value. // If not specified, defaults to "" -func DecodeCompressedCompressionType(value string) DecodeCompressedAttr { +func LoadTPUEmbeddingAdadeltaParametersTableName(value string) LoadTPUEmbeddingAdadeltaParametersAttr { return func(m optionalAttr) { - m["compression_type"] = value + m["table_name"] = value } } -// Decompress strings. +// Load Adadelta embedding parameters. // -// This op decompresses each element of the `bytes` input `Tensor`, which -// is assumed to be compressed using the given `compression_type`. -// -// The `output` is a string `Tensor` of the same shape as `bytes`, -// each element containing the decompressed data from the corresponding -// element in `bytes`. +// An op that loads optimization parameters into HBM for embedding. Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. // // Arguments: -// bytes: A Tensor of string which is compressed. +// parameters: Value of parameters used in the Adadelta optimization algorithm. +// accumulators: Value of accumulators used in the Adadelta optimization algorithm. +// updates: Value of updates used in the Adadelta optimization algorithm. // -// Returns A Tensor with the same shape as input `bytes`, uncompressed -// from bytes. -func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) { +// +// +// Returns the created operation. +func LoadTPUEmbeddingAdadeltaParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LoadTPUEmbeddingAdadeltaParameters", + Input: []tf.Input{ + parameters, accumulators, updates, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Multiplies sparse updates into the variable referenced by `resource`. +// +// This operation computes +// +// # Scalar indices +// ref[indices, ...] *= updates[...] +// +// # Vector indices (for each i) +// ref[indices[i], ...] *= updates[i, ...] +// +// # High rank indices (for each i, ..., j) +// ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...] 
+// +// Duplicate entries are handled correctly: if multiple `indices` reference +// the same location, their contributions multiply. +// +// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. +// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt> +// </div> +// +// Arguments: +// resource: Should be from a `Variable` node. +// indices: A tensor of indices into the first dimension of `ref`. +// updates: A tensor of updated values to add to `ref`. +// +// Returns the created operation. +func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ResourceScatterMul", + Input: []tf.Input{ + resource, indices, updates, + }, + } + return scope.AddOperation(opspec) +} + +// UnicodeEncodeAttr is an optional argument to UnicodeEncode. +type UnicodeEncodeAttr func(optionalAttr) + +// UnicodeEncodeErrors sets the optional errors attribute to value. +// +// value: Error handling policy when there is invalid formatting found in the input. +// The value of 'strict' will cause the operation to produce a InvalidArgument +// error on any invalid input formatting. A value of 'replace' (the default) will +// cause the operation to replace any invalid formatting in the input with the +// `replacement_char` codepoint. A value of 'ignore' will cause the operation to +// skip any invalid formatting in the input and produce no corresponding output +// character. +// If not specified, defaults to "replace" +func UnicodeEncodeErrors(value string) UnicodeEncodeAttr { + return func(m optionalAttr) { + m["errors"] = value + } +} + +// UnicodeEncodeReplacementChar sets the optional replacement_char attribute to value. +// +// value: The replacement character codepoint to be used in place of any invalid +// formatting in the input when `errors='replace'`. Any valid unicode codepoint may +// be used. The default value is the default unicode replacement character is +// 0xFFFD (U+65533). +// If not specified, defaults to 65533 +func UnicodeEncodeReplacementChar(value int64) UnicodeEncodeAttr { + return func(m optionalAttr) { + m["replacement_char"] = value + } +} + +// Encode a tensor of ints into unicode strings. +// +// Returns a vector of strings, where `output[i]` is constructed by encoding the +// Unicode codepoints in `input_values[input_splits[i]:input_splits[i+1]]` +// using `output_encoding`. +// +// --- +// +// Example: +// +// ``` +// input_values = [72, 101, 108, 108, 111, 87, 111, 114, 108, 100] +// input_splits = [0, 5, 10] +// output_encoding = 'UTF-8' +// +// output = ['Hello', 'World'] +// ``` +// +// Arguments: +// input_values: A 1D tensor containing the unicode codepoints that should be encoded. +// input_splits: A 1D tensor specifying how the unicode codepoints should be split into strings. +// In particular, `output[i]` is constructed by encoding the codepoints in the +// slice `input_values[input_splits[i]:input_splits[i+1]]`. +// output_encoding: Unicode encoding of the output strings. Valid encodings are: `"UTF-8", +// "UTF-16-BE", and "UTF-32-BE"`. +// +// Returns The 1-D Tensor of strings encoded from the provided unicode codepoints. 
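A small fragment mirroring the codepoint example in the comment above, showing the generated `UnicodeEncode` wrapper together with one of its optional error-handling attributes (illustrative values, same imports assumed as in the earlier sketches).

```go
s := op.NewScope()
// Codepoints for "Hello" and "World", split into two strings as in the example above.
values := op.Const(s, []int32{72, 101, 108, 108, 111, 87, 111, 114, 108, 100})
splits := op.Const(s, []int64{0, 5, 10})
encoded := op.UnicodeEncode(s, values, splits, "UTF-8",
	op.UnicodeEncodeErrors("replace"))
_ = encoded // a 1-D string tensor: ["Hello", "World"]
```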
+func UnicodeEncode(scope *Scope, input_values tf.Output, input_splits tf.Output, output_encoding string, optional ...UnicodeEncodeAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_encoding": output_encoding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "UnicodeEncode", + Input: []tf.Input{ + input_values, input_splits, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// StackPushV2Attr is an optional argument to StackPushV2. +type StackPushV2Attr func(optionalAttr) + +// StackPushV2SwapMemory sets the optional swap_memory attribute to value. +// +// value: Swap `elem` to CPU. Default to false. +// If not specified, defaults to false +func StackPushV2SwapMemory(value bool) StackPushV2Attr { + return func(m optionalAttr) { + m["swap_memory"] = value + } +} + +// Push an element onto the stack. +// +// Arguments: +// handle: The handle to a stack. +// elem: The tensor to be pushed onto the stack. +// +// Returns The same tensor as the input 'elem'. +func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) { if scope.Err() != nil { return } @@ -10729,9 +16688,9 @@ func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompresse a(attrs) } opspec := tf.OpSpec{ - Type: "DecodeCompressed", + Type: "StackPushV2", Input: []tf.Input{ - bytes, + handle, elem, }, Attrs: attrs, } @@ -10739,60 +16698,371 @@ func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompresse return op.Output(0) } -// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2. -type FusedBatchNormV2Attr func(optionalAttr) +// SparseMatMulAttr is an optional argument to SparseMatMul. +type SparseMatMulAttr func(optionalAttr) -// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value. -// -// value: A small float number added to the variance of x. -// If not specified, defaults to 0.0001 -func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr { +// SparseMatMulTransposeA sets the optional transpose_a attribute to value. +// If not specified, defaults to false +func SparseMatMulTransposeA(value bool) SparseMatMulAttr { return func(m optionalAttr) { - m["epsilon"] = value + m["transpose_a"] = value } } -// FusedBatchNormV2DataFormat sets the optional data_format attribute to value. -// -// value: The data format for x and y. Either "NHWC" (default) or "NCHW". -// If not specified, defaults to "NHWC" -func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr { +// SparseMatMulTransposeB sets the optional transpose_b attribute to value. +// If not specified, defaults to false +func SparseMatMulTransposeB(value bool) SparseMatMulAttr { return func(m optionalAttr) { - m["data_format"] = value + m["transpose_b"] = value } } -// FusedBatchNormV2IsTraining sets the optional is_training attribute to value. +// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value. +// If not specified, defaults to false +func SparseMatMulAIsSparse(value bool) SparseMatMulAttr { + return func(m optionalAttr) { + m["a_is_sparse"] = value + } +} + +// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value. +// If not specified, defaults to false +func SparseMatMulBIsSparse(value bool) SparseMatMulAttr { + return func(m optionalAttr) { + m["b_is_sparse"] = value + } +} + +// Multiply matrix "a" by matrix "b". 
// -// value: A bool value to indicate the operation is for training (default) -// or inference. +// The inputs must be two-dimensional matrices and the inner dimension of "a" must +// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not +// `SparseTensor`s. This op is optimized for the case where at least one of "a" or +// "b" is sparse, in the sense that they have a large proportion of zero values. +// The breakeven for using this versus a dense matrix multiply on one platform was +// 30% zero values in the sparse matrix. +// +// The gradient computation of this operation will only take advantage of sparsity +// in the input gradient when that gradient comes from a Relu. +func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SparseMatMul", + Input: []tf.Input{ + a, b, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Concatenates quantized tensors along one dimension. +// +// Arguments: +// concat_dim: 0-D. The dimension along which to concatenate. Must be in the +// range [0, rank(values)). +// values: The `N` Tensors to concatenate. Their ranks and types must match, +// and their sizes must match in all dimensions except `concat_dim`. +// input_mins: The minimum scalar values for each of the input tensors. +// input_maxes: The maximum scalar values for each of the input tensors. +// +// Returns A `Tensor` with the concatenation of values stacked along the +// `concat_dim` dimension. This tensor's shape matches that of `values` except +// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents. +func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "QuantizedConcat", + Input: []tf.Input{ + concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes), + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// LoadTPUEmbeddingRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingRMSPropParameters. +type LoadTPUEmbeddingRMSPropParametersAttr func(optionalAttr) + +// LoadTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func LoadTPUEmbeddingRMSPropParametersTableId(value int64) LoadTPUEmbeddingRMSPropParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// LoadTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func LoadTPUEmbeddingRMSPropParametersTableName(value string) LoadTPUEmbeddingRMSPropParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Load RMSProp embedding parameters. +// +// An op that loads optimization parameters into HBM for embedding. Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. 
For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. +// +// Arguments: +// parameters: Value of parameters used in the RMSProp optimization algorithm. +// ms: Value of ms used in the RMSProp optimization algorithm. +// mom: Value of mom used in the RMSProp optimization algorithm. +// +// +// +// Returns the created operation. +func LoadTPUEmbeddingRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LoadTPUEmbeddingRMSPropParameters", + Input: []tf.Input{ + parameters, ms, mom, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// TopKAttr is an optional argument to TopK. +type TopKAttr func(optionalAttr) + +// TopKSorted sets the optional sorted attribute to value. +// +// value: If true the resulting `k` elements will be sorted by the values in +// descending order. // If not specified, defaults to true -func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr { +func TopKSorted(value bool) TopKAttr { return func(m optionalAttr) { - m["is_training"] = value + m["sorted"] = value } } -// Batch normalization. +// Finds values and indices of the `k` largest elements for the last dimension. // -// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". -// The size of 1D Tensors matches the dimension C of the 4D Tensors. +// DEPRECATED at GraphDef version 7: Use TopKV2 instead +// +// If the input is a vector (rank-1), finds the `k` largest entries in the vector +// and outputs their values and indices as vectors. Thus `values[j]` is the +// `j`-th largest entry in `input`, and its index is `indices[j]`. +// +// For matrices (resp. higher rank input), computes the top `k` entries in each +// row (resp. vector along the last dimension). Thus, +// +// values.shape = indices.shape = input.shape[:-1] + [k] +// +// If two elements are equal, the lower-index element appears first. +// +// If `k` varies dynamically, use `TopKV2` below. // // Arguments: -// x: A 4D Tensor for input data. -// scale: A 1D Tensor for scaling factor, to scale the normalized x. -// offset: A 1D Tensor for offset, to shift to the normalized x. -// mean: A 1D Tensor for population mean. Used for inference only; -// must be empty for training. -// variance: A 1D Tensor for population variance. Used for inference only; -// must be empty for training. +// input: 1-D or higher with last dimension at least `k`. +// k: Number of top elements to look for along the last dimension (along each +// row for matrices). // -// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow -// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by -// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused -// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance -// in the cuDNN case), to be reused in the gradient computation. 
-func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) { +// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`. +func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"k": k} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TopK", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// Returns the max of x and y (i.e. x > y ? x : y) element-wise. +// +// *NOTE*: `Maximum` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Maximum", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation. +type SparseToSparseSetOperationAttr func(optionalAttr) + +// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value. +// If not specified, defaults to true +func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr { + return func(m optionalAttr) { + m["validate_indices"] = value + } +} + +// Applies set operation along last dimension of 2 `SparseTensor` inputs. +// +// See SetOperationOp::SetOperationFromContext for values of `set_operation`. +// +// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the +// order and range of `set1` and `set2` indices. +// +// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`, +// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same +// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but +// ignored. +// +// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`, +// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same +// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but +// ignored. +// +// If `validate_indices` is `True`, this op validates the order and range of `set1` +// and `set2` indices. +// +// Output `result` is a `SparseTensor` represented by `result_indices`, +// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this +// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth` +// dimension contains the result of `set_operation` applied to the corresponding +// `[0...n-1]` dimension of `set`. +// +// Arguments: +// set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major +// order. +// set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major +// order. +// set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must +// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the +// max set size across `0...n-1` dimensions. +// set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major +// order. 
+// set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major +// order. +// set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must +// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the +// max set size across `0...n-1` dimensions. +// +// +// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is +// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]` +// is the max result set size across all `0...n-1` dimensions. +func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"set_operation": set_operation} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SparseToSparseSetOperation", + Input: []tf.Input{ + set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Creates a tree resource and returns a handle to it. +// +// Arguments: +// tree_handle: Handle to the tree resource to be created. +// tree_config: Serialized proto string of the boosted_trees.Tree. +// +// Returns the created operation. +func TensorForestCreateTreeVariable(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorForestCreateTreeVariable", + Input: []tf.Input{ + tree_handle, tree_config, + }, + } + return scope.AddOperation(opspec) +} + +// Computes the complementary error function of `x` element-wise. +func Erfc(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Erfc", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2. +type SelfAdjointEigV2Attr func(optionalAttr) + +// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value. +// +// value: If `True` then eigenvectors will be computed and returned in `v`. +// Otherwise, only the eigenvalues will be computed. +// If not specified, defaults to true +func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr { + return func(m optionalAttr) { + m["compute_v"] = value + } +} + +// Computes the eigen decomposition of one or more square self-adjoint matrices. +// +// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in +// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues +// are sorted in non-decreasing order. +// +// ```python +// # a is a tensor. +// # e is a tensor of eigenvalues. +// # v is a tensor of eigenvectors. +// e, v = self_adjoint_eig(a) +// e = self_adjoint_eig(a, compute_v=False) +// ``` +// +// Arguments: +// input: `Tensor` input of shape `[N, N]`. +// +// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`. 
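+//
+// A minimal Go usage sketch (illustrative only, not generated from the op
+// registry; it assumes a valid `Scope` named `s` and a small symmetric
+// float32 matrix built with this package's `Const` wrapper):
+//
+// ```
+// a := Const(s, [][]float32{{2, 1}, {1, 2}})
+// e, v := SelfAdjointEigV2(s, a)
+// eOnly, _ := SelfAdjointEigV2(s, a, SelfAdjointEigV2ComputeV(false))
+// ```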
+func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) { if scope.Err() != nil { return } @@ -10801,186 +17071,70 @@ func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Outp a(attrs) } opspec := tf.OpSpec{ - Type: "FusedBatchNormV2", + Type: "SelfAdjointEigV2", Input: []tf.Input{ - x, scale, offset, mean, variance, + input, }, Attrs: attrs, } op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4) + return op.Output(0), op.Output(1) } -// Check if the input matches the regex pattern. -// -// The input is a string tensor of any shape. The pattern is a scalar -// string tensor which is applied to every element of the input tensor. -// The boolean values (True or False) of the output tensor indicate -// if the input matches the regex pattern provided. -// -// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax) -// -// Arguments: -// input: A string tensor of the text to be processed. -// pattern: A scalar string tensor containing the regular expression to match the input. -// -// Returns A bool tensor with the same shape as `input`. -func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "RegexFullMatch", - Input: []tf.Input{ - input, pattern, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} +// LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingFTRLParametersGradAccumDebug. +type LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr) -// Gradients for batch normalization. +// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 // -// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization() -// -// This op is deprecated. See `tf.nn.batch_normalization`. -// -// Arguments: -// t: A 4D input Tensor. -// m: A 1D mean Tensor with size matching the last dimension of t. -// This is the first output from tf.nn.moments, -// or a saved moving average thereof. -// v: A 1D variance Tensor with size matching the last dimension of t. -// This is the second output from tf.nn.moments, -// or a saved moving average thereof. -// gamma: A 1D gamma Tensor with size matching the last dimension of t. -// If "scale_after_normalization" is true, this Tensor will be multiplied -// with the normalized Tensor. -// backprop: 4D backprop Tensor. -// variance_epsilon: A small float number to avoid dividing by 0. -// scale_after_normalization: A bool indicating whether the resulted tensor -// needs to be multiplied with gamma. -// -// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma. 
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization} - opspec := tf.OpSpec{ - Type: "BatchNormWithGlobalNormalizationGrad", - Input: []tf.Input{ - t, m, v, gamma, backprop, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4) -} - -// Batch normalization. -// -// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization() -// -// This op is deprecated. Prefer `tf.nn.batch_normalization`. -// -// Arguments: -// t: A 4D input Tensor. -// m: A 1D mean Tensor with size matching the last dimension of t. -// This is the first output from tf.nn.moments, -// or a saved moving average thereof. -// v: A 1D variance Tensor with size matching the last dimension of t. -// This is the second output from tf.nn.moments, -// or a saved moving average thereof. -// beta: A 1D beta Tensor with size matching the last dimension of t. -// An offset to be added to the normalized tensor. -// gamma: A 1D gamma Tensor with size matching the last dimension of t. -// If "scale_after_normalization" is true, this tensor will be multiplied -// with the normalized tensor. -// variance_epsilon: A small float number to avoid dividing by 0. -// scale_after_normalization: A bool indicating whether the resulted tensor -// needs to be multiplied with gamma. -func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization} - opspec := tf.OpSpec{ - Type: "BatchNormWithGlobalNormalization", - Input: []tf.Input{ - t, m, v, beta, gamma, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StringToNumberAttr is an optional argument to StringToNumber. -type StringToNumberAttr func(optionalAttr) - -// StringToNumberOutType sets the optional out_type attribute to value. -// -// value: The numeric type to interpret each string in `string_tensor` as. -// If not specified, defaults to DT_FLOAT -func StringToNumberOutType(value tf.DataType) StringToNumberAttr { +// REQUIRES: value >= -1 +func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr { return func(m optionalAttr) { - m["out_type"] = value + m["table_id"] = value } } -// Converts each string in the input Tensor to the specified numeric type. +// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Load FTRL embedding parameters with debug support. // -// (Note that int32 overflow results in an error while float overflow -// results in a rounded value.) +// An op that loads optimization parameters into HBM for embedding. 
Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. // -// Returns A Tensor of the same shape as the input `string_tensor`. -func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) { +// Arguments: +// parameters: Value of parameters used in the FTRL optimization algorithm. +// accumulators: Value of accumulators used in the FTRL optimization algorithm. +// linears: Value of linears used in the FTRL optimization algorithm. +// gradient_accumulators: Value of gradient_accumulators used in the FTRL optimization algorithm. +// +// +// +// Returns the created operation. +func LoadTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr) (o *tf.Operation) { if scope.Err() != nil { return } - attrs := map[string]interface{}{} + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "StringToNumber", + Type: "LoadTPUEmbeddingFTRLParametersGradAccumDebug", Input: []tf.Input{ - string_tensor, + parameters, accumulators, linears, gradient_accumulators, }, Attrs: attrs, } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Convert JSON-encoded Example records to binary protocol buffer strings. -// -// This op translates a tensor containing Example records, encoded using -// the [standard JSON -// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json), -// into a tensor containing the same records encoded as binary protocol -// buffers. The resulting tensor can then be fed to any of the other -// Example-parsing ops. -// -// Arguments: -// json_examples: Each string is a JSON object serialized according to the JSON -// mapping of the Example proto. -// -// Returns Each string is a binary Example protocol buffer corresponding -// to the respective element of `json_examples`. -func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "DecodeJSONExample", - Input: []tf.Input{ - json_examples, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) + return scope.AddOperation(opspec) } // ParseSequenceExampleAttr is an optional argument to ParseSequenceExample. @@ -11191,94 +17345,27 @@ func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Outp return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths } -// Computes log softmax activations. +// Creates a dataset that will write to / read from a snapshot. // -// For each batch `i` and class `j` we have -// -// logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i]))) +// This dataset attempts to determine whether a valid snapshot exists at the +// `snapshot_path`, and reads from the snapshot in lieu of using `input_dataset`. +// If not, it will run the preprocessing pipeline as usual, and write out a +// snapshot of the data processed for future use. 
// // Arguments: -// logits: 2-D with shape `[batch_size, num_classes]`. +// input_dataset: A variant tensor representing the input dataset. +// path: The path we should write snapshots to / read snapshots from. // -// Returns Same shape as `logits`. -func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) { +// +func SnapshotDataset(scope *Scope, input_dataset tf.Output, path tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { if scope.Err() != nil { return } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} opspec := tf.OpSpec{ - Type: "LogSoftmax", + Type: "SnapshotDataset", Input: []tf.Input{ - logits, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes rectified linear: `max(features, 0)`. -func Relu(scope *Scope, features tf.Output) (activations tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Relu", - Input: []tf.Input{ - features, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the gradient for the sqrt of `x` wrt its input. -// -// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy` -// is the corresponding input gradient. -func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SqrtGrad", - Input: []tf.Input{ - y, dy, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Encodes a `RaggedTensor` into a `variant` Tensor. -// -// -// Encodes the given `RaggedTensor` and returns a `variant` Tensor. If -// `batched_input` is True, then input `RaggedTensor` is unbatched along the -// zero-th dimension, each component `RaggedTensor` is encoded into a scalar -// `variant` Tensor, and these are stacked to return a 1-D `variant` Tensor. -// If `batched_input` is False, then the input `RaggedTensor` is encoded as is and -// a scalar `variant` Tensor is returned. A `RaggedTensor` is encoded by first -// creating a 1-D `variant` Tensor with `ragged_rank + 1` elements, containing the -// splits and values Tensors of the `RaggedTensor`. Then the 1-D `variant` Tensor -// is wrapped in a scalar `variant` Tensor. See `RaggedTensorFromVariant` for the -// corresponding decoding logic. -// -// -// Arguments: -// rt_nested_splits: A list of one or more Tensors representing the splits of the input -// `RaggedTensor`. -// rt_dense_values: A Tensor representing the values of the input `RaggedTensor`. -// batched_input: A `bool` denoting whether the input is a batched `RaggedTensor`. -// -// Returns A `variant` Tensor that containing encoded `RaggedTensor`. -func RaggedTensorToVariant(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output, batched_input bool) (encoded_ragged tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"batched_input": batched_input} - opspec := tf.OpSpec{ - Type: "RaggedTensorToVariant", - Input: []tf.Input{ - tf.OutputList(rt_nested_splits), rt_dense_values, + input_dataset, path, }, Attrs: attrs, } @@ -11286,1233 +17373,142 @@ func RaggedTensorToVariant(scope *Scope, rt_nested_splits []tf.Output, rt_dense_ return op.Output(0) } -// Converts one or more images from RGB to HSV. +// Returns a diagonal tensor with a given diagonal values. // -// Outputs a tensor of the same shape as the `images` tensor, containing the HSV -// value of the pixels. 
The output is only well defined if the value in `images` -// are in `[0,1]`. +// Given a `diagonal`, this operation returns a tensor with the `diagonal` and +// everything else padded with zeros. The diagonal is computed as follows: // -// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and -// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0 -// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue. +// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of +// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where: // -// Arguments: -// images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3. +// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else. // -// Returns `images` converted to HSV. -func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "RGBToHSV", - Input: []tf.Input{ - images, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RaggedRangeAttr is an optional argument to RaggedRange. -type RaggedRangeAttr func(optionalAttr) - -// RaggedRangeTsplits sets the optional Tsplits attribute to value. -// If not specified, defaults to DT_INT64 -func RaggedRangeTsplits(value tf.DataType) RaggedRangeAttr { - return func(m optionalAttr) { - m["Tsplits"] = value - } -} - -// Returns a `RaggedTensor` containing the specified sequences of numbers. -// -// -// Returns a `RaggedTensor` `result` composed from `rt_dense_values` and -// `rt_nested_splits`, such that -// `result[i] = range(starts[i], limits[i], deltas[i])`. -// -// ```python -// >>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range( -// ... starts=[2, 5, 8], limits=[3, 5, 12], deltas=1) -// >>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits) -// >>> print result.eval().tolist() -// [[2], # result[0] = range(2, 3) -// [], # result[1] = range(5, 5) -// [8, 9, 10, 11]] # result[2] = range(8, 12) -// ``` -// -// The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors. -// The vector inputs must all have the same size. Scalar inputs are broadcast -// to match the size of the vector inputs. -// -// Arguments: -// starts: The starts of each range. -// limits: The limits of each range. -// deltas: The deltas of each range. -// -// Returns The `row_splits` for the returned `RaggedTensor`.The `flat_values` for the returned `RaggedTensor`. -func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output, optional ...RaggedRangeAttr) (rt_nested_splits tf.Output, rt_dense_values tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RaggedRange", - Input: []tf.Input{ - starts, limits, deltas, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// RandomGammaAttr is an optional argument to RandomGamma. -type RandomGammaAttr func(optionalAttr) - -// RandomGammaSeed sets the optional seed attribute to value. -// -// value: If either `seed` or `seed2` are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. 
-// If not specified, defaults to 0 -func RandomGammaSeed(value int64) RandomGammaAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// RandomGammaSeed2 sets the optional seed2 attribute to value. -// -// value: A second seed to avoid seed collision. -// If not specified, defaults to 0 -func RandomGammaSeed2(value int64) RandomGammaAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Outputs random values from the Gamma distribution(s) described by alpha. -// -// This op uses the algorithm by Marsaglia et al. to acquire samples via -// transformation-rejection from pairs of uniform and normal random variables. -// See http://dl.acm.org/citation.cfm?id=358414 -// -// Arguments: -// shape: 1-D integer tensor. Shape of independent samples to draw from each -// distribution described by the shape parameters given in alpha. -// alpha: A tensor in which each scalar is a "shape" parameter describing the -// associated gamma distribution. -// -// Returns A tensor with shape `shape + shape(alpha)`. Each slice -// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for -// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha. -func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RandomGamma", - Input: []tf.Input{ - shape, alpha, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Says whether the targets are in the top `K` predictions. -// -// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the -// prediction for the target class is among the top `k` predictions among -// all predictions for example `i`. Note that the behavior of `InTopK` differs -// from the `TopK` op in its handling of ties; if multiple classes have the -// same prediction value and straddle the top-`k` boundary, all of those -// classes are considered to be in the top `k`. -// -// More formally, let -// -// \\(predictions_i\\) be the predictions for all classes for example `i`, -// \\(targets_i\\) be the target class for example `i`, -// \\(out_i\\) be the output for example `i`, -// -// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$ -// -// Arguments: -// predictions: A `batch_size` x `classes` tensor. -// targets: A `batch_size` vector of class ids. -// k: Number of top elements to look at for computing precision. -// -// Returns Computed Precision at `k` as a `bool Tensor`. -func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"k": k} - opspec := tf.OpSpec{ - Type: "InTopK", - Input: []tf.Input{ - predictions, targets, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative. -type DepthwiseConv2dNativeAttr func(optionalAttr) - -// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value. -// -// value: Specify the data format of the input and output data. With the -// default format "NHWC", the data is stored in the order of: -// [batch, height, width, channels]. -// Alternatively, the format could be "NCHW", the data storage order of: -// [batch, channels, height, width]. 
-// If not specified, defaults to "NHWC" -func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// DepthwiseConv2dNativeDilations sets the optional dilations attribute to value. -// -// value: 1-D tensor of length 4. The dilation factor for each dimension of -// `input`. If set to k > 1, there will be k-1 skipped cells between each filter -// element on that dimension. The dimension order is determined by the value of -// `data_format`, see above for details. Dilations in the batch and depth -// dimensions must be 1. -// If not specified, defaults to <i:1 i:1 i:1 i:1 > -func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors. -// -// Given an input tensor of shape `[batch, in_height, in_width, in_channels]` -// and a filter / kernel tensor of shape -// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing -// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies -// a different filter to each input channel (expanding from 1 channel to -// `channel_multiplier` channels for each), then concatenates the results -// together. Thus, the output has `in_channels * channel_multiplier` channels. +// For example: // // ``` -// for k in 0..in_channels-1 -// for q in 0..channel_multiplier-1 -// output[b, i, j, k * channel_multiplier + q] = -// sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] * -// filter[di, dj, k, q] -// ``` -// -// Must have `strides[0] = strides[3] = 1`. For the most common case of the same -// horizontal and vertices strides, `strides = [1, stride, stride, 1]`. -// -// Arguments: -// -// -// strides: 1-D of length 4. The stride of the sliding window for each dimension -// of `input`. -// padding: The type of padding algorithm to use. -func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "DepthwiseConv2dNative", - Input: []tf.Input{ - input, filter, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RequantizePerChannelAttr is an optional argument to RequantizePerChannel. -type RequantizePerChannelAttr func(optionalAttr) - -// RequantizePerChannelOutType sets the optional out_type attribute to value. -// -// value: The quantized type of output tensor that needs to be converted. -// If not specified, defaults to DT_QUINT8 -func RequantizePerChannelOutType(value tf.DataType) RequantizePerChannelAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// Requantizes input with min and max values known per channel. -// -// Arguments: -// input: The original input tensor. -// input_min: The minimum value of the input tensor -// input_max: The maximum value of the input tensor. -// requested_output_min: The minimum value of the output tensor requested. -// requested_output_max: The maximum value of the output tensor requested. -// -// Returns Output tensor.The minimum value of the final output tensorThe maximum value of the final output tensor. 
-func RequantizePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, optional ...RequantizePerChannelAttr) (output tf.Output, output_min tf.Output, output_max tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RequantizePerChannel", - Input: []tf.Input{ - input, input_min, input_max, requested_output_min, requested_output_max, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// MaxPool3DAttr is an optional argument to MaxPool3D. -type MaxPool3DAttr func(optionalAttr) - -// MaxPool3DDataFormat sets the optional data_format attribute to value. -// -// value: The data format of the input and output data. With the -// default format "NDHWC", the data is stored in the order of: -// [batch, in_depth, in_height, in_width, in_channels]. -// Alternatively, the format could be "NCDHW", the data storage order is: -// [batch, in_channels, in_depth, in_height, in_width]. -// If not specified, defaults to "NDHWC" -func MaxPool3DDataFormat(value string) MaxPool3DAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Performs 3D max pooling on the input. -// -// Arguments: -// input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over. -// ksize: 1-D tensor of length 5. The size of the window for each dimension of -// the input tensor. Must have `ksize[0] = ksize[4] = 1`. -// strides: 1-D tensor of length 5. The stride of the sliding window for each -// dimension of `input`. Must have `strides[0] = strides[4] = 1`. -// padding: The type of padding algorithm to use. -// -// Returns The max pooled output tensor. -func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MaxPool3D", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// 2D real-valued fast Fourier transform. -// -// Computes the 2-dimensional discrete Fourier transform of a real-valued signal -// over the inner-most 2 dimensions of `input`. -// -// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the -// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension -// of `output`: the zero-frequency term, followed by the `fft_length / 2` -// positive-frequency terms. -// -// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the -// corresponding dimension of `input`, the dimension is cropped. If it is larger, -// the dimension is padded with zeros. -// -// Arguments: -// input: A float32 tensor. -// fft_length: An int32 tensor of shape [2]. The FFT length for each dimension. -// -// Returns A complex64 tensor of the same rank as `input`. The inner-most 2 -// dimensions of `input` are replaced with their 2D Fourier transform. The -// inner-most dimension contains `fft_length / 2 + 1` unique frequency -// components. 
-// -// @compatibility(numpy) -// Equivalent to np.fft.rfft2 -// @end_compatibility -func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "RFFT2D", - Input: []tf.Input{ - input, fft_length, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RandomUniformIntAttr is an optional argument to RandomUniformInt. -type RandomUniformIntAttr func(optionalAttr) - -// RandomUniformIntSeed sets the optional seed attribute to value. -// -// value: If either `seed` or `seed2` are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func RandomUniformIntSeed(value int64) RandomUniformIntAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// RandomUniformIntSeed2 sets the optional seed2 attribute to value. -// -// value: A second seed to avoid seed collision. -// If not specified, defaults to 0 -func RandomUniformIntSeed2(value int64) RandomUniformIntAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Outputs random integers from a uniform distribution. -// -// The generated values are uniform integers in the range `[minval, maxval)`. -// The lower bound `minval` is included in the range, while the upper bound -// `maxval` is excluded. -// -// The random integers are slightly biased unless `maxval - minval` is an exact -// power of two. The bias is small for values of `maxval - minval` significantly -// smaller than the range of the output (either `2^32` or `2^64`). -// -// Arguments: -// shape: The shape of the output tensor. -// minval: 0-D. Inclusive lower bound on the generated integers. -// maxval: 0-D. Exclusive upper bound on the generated integers. -// -// Returns A tensor of the specified shape filled with uniform random integers. -func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RandomUniformInt", - Input: []tf.Input{ - shape, minval, maxval, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns x / y element-wise for integer types. -// -// Truncation designates that negative numbers will round fractional quantities -// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different -// than Python semantics. See `FloorDiv` for a division function that matches -// Python Semantics. -// -// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TruncateDiv", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage. -type CropAndResizeGradImageAttr func(optionalAttr) - -// CropAndResizeGradImageMethod sets the optional method attribute to value. -// -// value: A string specifying the interpolation method. Only 'bilinear' is -// supported for now. 
-// If not specified, defaults to "bilinear" -func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr { - return func(m optionalAttr) { - m["method"] = value - } -} - -// Computes the gradient of the crop_and_resize op wrt the input image tensor. -// -// Arguments: -// grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`. -// boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor -// specifies the coordinates of a box in the `box_ind[i]` image and is specified -// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of -// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the -// `[0, 1]` interval of normalized image height is mapped to -// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in -// which case the sampled crop is an up-down flipped version of the original -// image. The width dimension is treated similarly. Normalized coordinates -// outside the `[0, 1]` range are allowed, in which case we use -// `extrapolation_value` to extrapolate the input image values. -// box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`. -// The value of `box_ind[i]` specifies the image that the `i`-th box refers to. -// image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]` -// containing the original image size. Both `image_height` and `image_width` need -// to be positive. -// -// -// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`. -func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"T": T} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "CropAndResizeGradImage", - Input: []tf.Input{ - grads, boxes, box_ind, image_size, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MaxPoolV2Attr is an optional argument to MaxPoolV2. -type MaxPoolV2Attr func(optionalAttr) - -// MaxPoolV2DataFormat sets the optional data_format attribute to value. -// -// value: Specify the data format of the input and output data. With the -// default format "NHWC", the data is stored in the order of: -// [batch, in_height, in_width, in_channels]. -// Alternatively, the format could be "NCHW", the data storage order of: -// [batch, in_channels, in_height, in_width]. -// If not specified, defaults to "NHWC" -func MaxPoolV2DataFormat(value string) MaxPoolV2Attr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Performs max pooling on the input. -// -// Arguments: -// input: 4-D input to pool over. -// ksize: The size of the window for each dimension of the input tensor. -// strides: The stride of the sliding window for each dimension of the -// input tensor. -// padding: The type of padding algorithm to use. -// -// Returns The max pooled output tensor. 
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MaxPoolV2", - Input: []tf.Input{ - input, ksize, strides, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns x * y element-wise. -// -// *NOTE*: `Multiply` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Mul", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Advance the counter of a counter-based RNG. -// -// The state of the RNG after -// `rng_skip(n)` will be the same as that after `stateful_uniform([n])` -// (or any other distribution). The actual increment added to the -// counter is an unspecified implementation detail. -// -// Arguments: -// resource: The handle of the resource variable that stores the state of the RNG. -// algorithm: The RNG algorithm. -// delta: The amount of advancement. -// -// Returns the created operation. -func RngSkip(scope *Scope, resource tf.Output, algorithm tf.Output, delta tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "RngSkip", - Input: []tf.Input{ - resource, algorithm, delta, - }, - } - return scope.AddOperation(opspec) -} - -// Says whether the targets are in the top `K` predictions. -// -// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the -// prediction for the target class is among the top `k` predictions among -// all predictions for example `i`. Note that the behavior of `InTopK` differs -// from the `TopK` op in its handling of ties; if multiple classes have the -// same prediction value and straddle the top-`k` boundary, all of those -// classes are considered to be in the top `k`. -// -// More formally, let -// -// \\(predictions_i\\) be the predictions for all classes for example `i`, -// \\(targets_i\\) be the target class for example `i`, -// \\(out_i\\) be the output for example `i`, -// -// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$ -// -// Arguments: -// predictions: A `batch_size` x `classes` tensor. -// targets: A `batch_size` vector of class ids. -// k: Number of top elements to look at for computing precision. -// -// Returns Computed precision at `k` as a `bool Tensor`. -func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "InTopKV2", - Input: []tf.Input{ - predictions, targets, k, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StatefulTruncatedNormalAttr is an optional argument to StatefulTruncatedNormal. -type StatefulTruncatedNormalAttr func(optionalAttr) - -// StatefulTruncatedNormalDtype sets the optional dtype attribute to value. -// -// value: The type of the output. -// If not specified, defaults to DT_FLOAT -func StatefulTruncatedNormalDtype(value tf.DataType) StatefulTruncatedNormalAttr { - return func(m optionalAttr) { - m["dtype"] = value - } -} - -// Outputs random values from a truncated normal distribution. 
-// -// The generated values follow a normal distribution with mean 0 and standard -// deviation 1, except that values whose magnitude is more than 2 standard -// deviations from the mean are dropped and re-picked. -// -// Arguments: -// resource: The handle of the resource variable that stores the state of the RNG. -// algorithm: The RNG algorithm. -// shape: The shape of the output tensor. -// -// Returns Random values with specified shape. -func StatefulTruncatedNormal(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulTruncatedNormalAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StatefulTruncatedNormal", - Input: []tf.Input{ - resource, algorithm, shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StatefulUniformAttr is an optional argument to StatefulUniform. -type StatefulUniformAttr func(optionalAttr) - -// StatefulUniformDtype sets the optional dtype attribute to value. -// -// value: The type of the output. -// If not specified, defaults to DT_FLOAT -func StatefulUniformDtype(value tf.DataType) StatefulUniformAttr { - return func(m optionalAttr) { - m["dtype"] = value - } -} - -// Outputs random values from a uniform distribution. -// -// The generated values follow a uniform distribution in the range `[0, 1)`. The -// lower bound 0 is included in the range, while the upper bound 1 is excluded. -// -// Arguments: -// resource: The handle of the resource variable that stores the state of the RNG. -// algorithm: The RNG algorithm. -// shape: The shape of the output tensor. -// -// Returns Random values with specified shape. -func StatefulUniform(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulUniformAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StatefulUniform", - Input: []tf.Input{ - resource, algorithm, shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RandomShuffleAttr is an optional argument to RandomShuffle. -type RandomShuffleAttr func(optionalAttr) - -// RandomShuffleSeed sets the optional seed attribute to value. -// -// value: If either `seed` or `seed2` are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func RandomShuffleSeed(value int64) RandomShuffleAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// RandomShuffleSeed2 sets the optional seed2 attribute to value. -// -// value: A second seed to avoid seed collision. -// If not specified, defaults to 0 -func RandomShuffleSeed2(value int64) RandomShuffleAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Randomly shuffles a tensor along its first dimension. -// -// The tensor is shuffled along dimension 0, such that each `value[j]` is mapped -// to one and only one `output[i]`. For example, a mapping that might occur for a -// 3x2 tensor is: -// -// ``` -// [[1, 2], [[5, 6], -// [3, 4], ==> [1, 2], -// [5, 6]] [3, 4]] +// # 'diagonal' is [1, 2, 3, 4] +// tf.diag(diagonal) ==> [[1, 0, 0, 0] +// [0, 2, 0, 0] +// [0, 0, 3, 0] +// [0, 0, 0, 4]] // ``` // // Arguments: -// value: The tensor to be shuffled. 
-// -// Returns A tensor of same shape and type as `value`, shuffled along its first -// dimension. -func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RandomShuffle", - Input: []tf.Input{ - value, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Produces the average pool of the input tensor for quantized types. -// -// Arguments: -// input: 4-D with shape `[batch, height, width, channels]`. -// min_input: The float value that the lowest quantized input value represents. -// max_input: The float value that the highest quantized input value represents. -// ksize: The size of the window for each dimension of the input tensor. -// The length must be 4 to match the number of dimensions of the input. -// strides: The stride of the sliding window for each dimension of the input -// tensor. The length must be 4 to match the number of dimensions of the input. -// padding: The type of padding algorithm to use. -// -// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. -func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} - opspec := tf.OpSpec{ - Type: "QuantizedAvgPool", - Input: []tf.Input{ - input, min_input, max_input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Performs a padding as a preprocess during a convolution. -// -// Similar to FusedResizeAndPadConv2d, this op allows for an optimized -// implementation where the spatial padding transformation stage is fused with the -// im2col lookup, but in this case without the bilinear filtering required for -// resizing. Fusing the padding prevents the need to write out the intermediate -// results as whole tensors, reducing memory pressure, and we can get some latency -// gains by merging the transformation calculations. -// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC' -// order is used instead. -// Internally this op uses a single per-graph scratch buffer, which means that it -// will block if multiple versions are being run in parallel. This is because this -// operator is primarily an optimization to minimize memory usage. -// -// Arguments: -// input: 4-D with shape `[batch, in_height, in_width, in_channels]`. -// paddings: A two-column matrix specifying the padding sizes. The number of -// rows must be the same as the rank of `input`. -// filter: 4-D with shape -// `[filter_height, filter_width, in_channels, out_channels]`. -// -// strides: 1-D of length 4. The stride of the sliding window for each dimension -// of `input`. Must be in the same order as the dimension specified with format. -// padding: The type of padding algorithm to use. 
-func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding} - opspec := tf.OpSpec{ - Type: "FusedPadConv2D", - Input: []tf.Input{ - input, paddings, filter, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Execute a sub graph on a remote processor. -// -// The graph specifications(such as graph itself, input tensors and output names) -// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo -// as serialized_remote_fused_graph_execute_info. -// The specifications will be passed to a dedicated registered -// remote fused graph executor. The executor will send the graph specifications -// to a remote processor and execute that graph. The execution results -// will be passed to consumer nodes as outputs of this node. -// -// Arguments: -// inputs: Arbitrary number of tensors with arbitrary data types -// -// serialized_remote_fused_graph_execute_info: Serialized protocol buffer -// of RemoteFusedGraphExecuteInfo which contains graph specifications. -// -// Returns Arbitrary number of tensors with arbitrary data types -func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info} - opspec := tf.OpSpec{ - Type: "RemoteFusedGraphExecute", - Input: []tf.Input{ - tf.OutputList(inputs), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil { - scope.UpdateErr("RemoteFusedGraphExecute", err) - return - } - return outputs -} - -// Locks a mutex resource. The output is the lock. So long as the lock tensor -// -// is alive, any other request to use `MutexLock` with this mutex will wait. -// -// This is particularly useful for creating a critical section when used in -// conjunction with `MutexLockIdentity`: -// -// ```python -// -// mutex = mutex_v2( -// shared_name=handle_name, container=container, name=name) -// -// def execute_in_critical_section(fn, *args, **kwargs): -// lock = gen_resource_variable_ops.mutex_lock(mutex) -// -// with ops.control_dependencies([lock]): -// r = fn(*args, **kwargs) -// -// with ops.control_dependencies(nest.flatten(r)): -// with ops.colocate_with(mutex): -// ensure_lock_exists = mutex_lock_identity(lock) -// -// # Make sure that if any element of r is accessed, all of -// # them are executed together. -// r = nest.map_structure(tf.identity, r) -// -// with ops.control_dependencies([ensure_lock_exists]): -// return nest.map_structure(tf.identity, r) -// ``` -// -// While `fn` is running in the critical section, no other functions which wish to -// use this critical section may run. -// -// Often the use case is that two executions of the same graph, in parallel, -// wish to run `fn`; and we wish to ensure that only one of them executes -// at a time. This is especially important if `fn` modifies one or more -// variables at a time. -// -// It is also useful if two separate functions must share a resource, but we -// wish to ensure the usage is exclusive. 
-// -// Arguments: -// mutex: The mutex resource to lock. -// -// Returns A tensor that keeps a shared pointer to a lock on the mutex; -// when the Tensor is destroyed, the use count on the shared pointer is decreased -// by 1. When it reaches 0, the lock is released. -func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) { +// diagonal: Rank k tensor where k is at most 1. +func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "MutexLock", + Type: "Diag", Input: []tf.Input{ - mutex, + diagonal, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// Merges summaries. -// -// This op creates a -// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) -// protocol buffer that contains the union of all the values in the input -// summaries. -// -// When the Op is run, it reports an `InvalidArgument` error if multiple values -// in the summaries to merge use the same tag. -// -// Arguments: -// inputs: Can be of any shape. Each must contain serialized `Summary` protocol -// buffers. -// -// Returns Scalar. Serialized `Summary` protocol buffer. -func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "MergeSummary", - Input: []tf.Input{ - tf.OutputList(inputs), - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} +// LoadTPUEmbeddingAdagradParametersAttr is an optional argument to LoadTPUEmbeddingAdagradParameters. +type LoadTPUEmbeddingAdagradParametersAttr func(optionalAttr) -// Assigns sparse updates to the variable referenced by `resource`. -// -// This operation computes -// -// # Scalar indices -// ref[indices, ...] = updates[...] -// -// # Vector indices (for each i) -// ref[indices[i], ...] = updates[i, ...] -// -// # High rank indices (for each i, ..., j) -// ref[indices[i, ..., j], ...] = updates[i, ..., j, ...] -// -// Arguments: -// resource: Should be from a `Variable` node. -// indices: A tensor of indices into the first dimension of `ref`. -// updates: A tensor of updated values to add to `ref`. -// -// Returns the created operation. -func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ResourceScatterUpdate", - Input: []tf.Input{ - resource, indices, updates, - }, - } - return scope.AddOperation(opspec) -} - -// Multiplies sparse updates into the variable referenced by `resource`. -// -// This operation computes -// -// # Scalar indices -// ref[indices, ...] *= updates[...] -// -// # Vector indices (for each i) -// ref[indices[i], ...] *= updates[i, ...] -// -// # High rank indices (for each i, ..., j) -// ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...] -// -// Duplicate entries are handled correctly: if multiple `indices` reference -// the same location, their contributions multiply. -// -// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt> -// </div> -// -// Arguments: -// resource: Should be from a `Variable` node. -// indices: A tensor of indices into the first dimension of `ref`. -// updates: A tensor of updated values to add to `ref`. -// -// Returns the created operation. 
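The `Diag` wrapper added above can be exercised end to end with the standard Go bindings. The sketch below is illustrative only: it assumes the usual `tensorflow/go` and `tensorflow/go/op` import paths and the `op.NewScope`/`op.Const`/`tf.NewSession` entry points, and is not generated code from this change.

```go
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	// Build a small graph: a constant diagonal fed into the Diag wrapper.
	s := op.NewScope()
	diagonal := op.Const(s, []int32{1, 2, 3, 4})
	d := op.Diag(s, diagonal)

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	// Expect a 4x4 matrix with [1 2 3 4] on the diagonal and zeros elsewhere,
	// matching the example in the Diag doc comment.
	out, err := sess.Run(nil, []tf.Output{d}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value())
}
```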
-func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ResourceScatterMul", - Input: []tf.Input{ - resource, indices, updates, - }, - } - return scope.AddOperation(opspec) -} - -// Connects N inputs to an N-way replicated TPU computation. -func TPUReplicatedInput(scope *Scope, inputs []tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TPUReplicatedInput", - Input: []tf.Input{ - tf.OutputList(inputs), - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Subtracts sparse updates from the variable referenced by `resource`. -// -// This operation computes -// -// # Scalar indices -// ref[indices, ...] -= updates[...] -// -// # Vector indices (for each i) -// ref[indices[i], ...] -= updates[i, ...] -// -// # High rank indices (for each i, ..., j) -// ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...] -// -// Duplicate entries are handled correctly: if multiple `indices` reference -// the same location, their contributions add. -// -// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt> -// </div> -// -// Arguments: -// resource: Should be from a `Variable` node. -// indices: A tensor of indices into the first dimension of `ref`. -// updates: A tensor of updated values to add to `ref`. -// -// Returns the created operation. -func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ResourceScatterSub", - Input: []tf.Input{ - resource, indices, updates, - }, - } - return scope.AddOperation(opspec) -} - -// Adds sparse updates to the variable referenced by `resource`. -// -// This operation computes -// -// # Scalar indices -// ref[indices, ...] += updates[...] -// -// # Vector indices (for each i) -// ref[indices[i], ...] += updates[i, ...] -// -// # High rank indices (for each i, ..., j) -// ref[indices[i, ..., j], ...] += updates[i, ..., j, ...] -// -// Duplicate entries are handled correctly: if multiple `indices` reference -// the same location, their contributions add. -// -// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt> -// </div> -// -// Arguments: -// resource: Should be from a `Variable` node. -// indices: A tensor of indices into the first dimension of `ref`. -// updates: A tensor of updated values to add to `ref`. -// -// Returns the created operation. -func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ResourceScatterAdd", - Input: []tf.Input{ - resource, indices, updates, - }, - } - return scope.AddOperation(opspec) -} - -// Counts the number of occurrences of each value in an integer array. -// -// Outputs a vector with length `size` and the same dtype as `weights`. If -// `weights` are empty, then index `i` stores the number of times the value `i` is -// counted in `arr`. 
If `weights` are non-empty, then index `i` stores the sum of -// the value in `weights` at each index where the corresponding value in `arr` is -// `i`. -// -// Values in `arr` outside of the range [0, size) are ignored. -// -// Arguments: -// arr: int32 `Tensor`. -// size: non-negative int32 scalar `Tensor`. -// weights: is an int32, int64, float32, or float64 `Tensor` with the same -// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights -// equal to 1. -// -// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for -// each value in the range [0, size). -func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Bincount", - Input: []tf.Input{ - arr, size, weights, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// InfeedEnqueuePrelinearizedBufferAttr is an optional argument to InfeedEnqueuePrelinearizedBuffer. -type InfeedEnqueuePrelinearizedBufferAttr func(optionalAttr) - -// InfeedEnqueuePrelinearizedBufferDeviceOrdinal sets the optional device_ordinal attribute to value. -// -// value: The TPU device to use. This should be -1 when the Op is running on a TPU device -// and = 0 when the Op is running on the CPU device. +// LoadTPUEmbeddingAdagradParametersTableId sets the optional table_id attribute to value. // If not specified, defaults to -1 -func InfeedEnqueuePrelinearizedBufferDeviceOrdinal(value int64) InfeedEnqueuePrelinearizedBufferAttr { +// +// REQUIRES: value >= -1 +func LoadTPUEmbeddingAdagradParametersTableId(value int64) LoadTPUEmbeddingAdagradParametersAttr { return func(m optionalAttr) { - m["device_ordinal"] = value + m["table_id"] = value } } -// An op which enqueues prelinearized buffer into TPU infeed. +// LoadTPUEmbeddingAdagradParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func LoadTPUEmbeddingAdagradParametersTableName(value string) LoadTPUEmbeddingAdagradParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Load Adagrad embedding parameters. +// +// An op that loads optimization parameters into HBM for embedding. Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. // // Arguments: -// input: A variant tensor representing linearized output. +// parameters: Value of parameters used in the Adagrad optimization algorithm. +// accumulators: Value of accumulators used in the Adagrad optimization algorithm. +// +// // // Returns the created operation. -func InfeedEnqueuePrelinearizedBuffer(scope *Scope, input tf.Output, optional ...InfeedEnqueuePrelinearizedBufferAttr) (o *tf.Operation) { +func LoadTPUEmbeddingAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdagradParametersAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LoadTPUEmbeddingAdagradParameters", + Input: []tf.Input{ + parameters, accumulators, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam. 
+type ResourceApplyAdamAttr func(optionalAttr) + +// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var, m, and v tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value. +// +// value: If `True`, uses the nesterov update. +// If not specified, defaults to false +func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr { + return func(m optionalAttr) { + m["use_nesterov"] = value + } +} + +// Update '*var' according to the Adam algorithm. +// +// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$ +// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ +// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ +// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ +// +// Arguments: +// var_: Should be from a Variable(). +// m: Should be from a Variable(). +// v: Should be from a Variable(). +// beta1_power: Must be a scalar. +// beta2_power: Must be a scalar. +// lr: Scaling factor. Must be a scalar. +// beta1: Momentum factor. Must be a scalar. +// beta2: Momentum factor. Must be a scalar. +// epsilon: Ridge term. Must be a scalar. +// grad: The gradient. +// +// Returns the created operation. +func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) { if scope.Err() != nil { return } @@ -12521,188 +17517,22 @@ func InfeedEnqueuePrelinearizedBuffer(scope *Scope, input tf.Output, optional .. a(attrs) } opspec := tf.OpSpec{ - Type: "InfeedEnqueuePrelinearizedBuffer", + Type: "ResourceApplyAdam", Input: []tf.Input{ - input, + var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad, }, Attrs: attrs, } return scope.AddOperation(opspec) } -// Checks whether a resource handle-based variable has been initialized. -// -// Arguments: -// resource: the input resource handle. -// -// Returns a scalar boolean which is true if the variable has been -// initialized. -func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) { +// Returns the truth value of NOT x element-wise. +func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "VarIsInitializedOp", - Input: []tf.Input{ - resource, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Subtracts a value from the current value of a variable. -// -// Any ReadVariableOp with a control dependency on this op is guaranteed to -// see the decremented value or a subsequent newer one. -// -// Arguments: -// resource: handle to the resource in which to store the variable. -// value: the value by which the variable will be incremented. -// -// Returns the created operation. -func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "AssignSubVariableOp", - Input: []tf.Input{ - resource, value, - }, - } - return scope.AddOperation(opspec) -} - -// SubstrAttr is an optional argument to Substr. 
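The update documented for `ResourceApplyAdam` above can be sanity-checked in isolation. Below is a minimal sketch of one update step using plain float64 stand-ins with illustrative names; it does not use the resource-variable op or the Go bindings at all.

```go
package main

import (
	"fmt"
	"math"
)

// adamStep applies one step of the update documented for ResourceApplyAdam:
//   lr_t = lr * sqrt(1 - beta2^t) / (1 - beta1^t)
//   m    = beta1*m + (1-beta1)*g
//   v    = beta2*v + (1-beta2)*g*g
//   w    = w - lr_t * m / (sqrt(v) + epsilon)
func adamStep(w, m, v, g, lr, beta1, beta2, epsilon float64, t int) (float64, float64, float64) {
	beta1Power := math.Pow(beta1, float64(t))
	beta2Power := math.Pow(beta2, float64(t))
	lrT := lr * math.Sqrt(1-beta2Power) / (1 - beta1Power)
	m = beta1*m + (1-beta1)*g
	v = beta2*v + (1-beta2)*g*g
	w -= lrT * m / (math.Sqrt(v) + epsilon)
	return w, m, v
}

func main() {
	w, m, v := 1.0, 0.0, 0.0
	grad := 0.5
	for t := 1; t <= 3; t++ {
		w, m, v = adamStep(w, m, v, grad, 0.001, 0.9, 0.999, 1e-8, t)
	}
	fmt.Printf("w=%.6f m=%.6f v=%.6f\n", w, m, v)
}
```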
-type SubstrAttr func(optionalAttr) - -// SubstrUnit sets the optional unit attribute to value. -// -// value: The unit that is used to create the substring. One of: `"BYTE"` (for -// defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8 -// encoded Unicode code points). The default is `"BYTE"`. Results are undefined if -// `unit=UTF8_CHAR` and the `input` strings do not contain structurally valid -// UTF-8. -// If not specified, defaults to "BYTE" -func SubstrUnit(value string) SubstrAttr { - return func(m optionalAttr) { - m["unit"] = value - } -} - -// Return substrings from `Tensor` of strings. -// -// For each string in the input `Tensor`, creates a substring starting at index -// `pos` with a total length of `len`. -// -// If `len` defines a substring that would extend beyond the length of the input -// string, then as many characters as possible are used. -// -// A negative `pos` indicates distance within the string backwards from the end. -// -// If `pos` specifies an index which is out of range for any of the input strings, -// then an `InvalidArgumentError` is thrown. -// -// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on -// Op creation. -// -// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about -// broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -// -// --- -// -// Examples -// -// Using scalar `pos` and `len`: -// -// ```python -// input = [b'Hello', b'World'] -// position = 1 -// length = 3 -// -// output = [b'ell', b'orl'] -// ``` -// -// Using `pos` and `len` with same shape as `input`: -// -// ```python -// input = [[b'ten', b'eleven', b'twelve'], -// [b'thirteen', b'fourteen', b'fifteen'], -// [b'sixteen', b'seventeen', b'eighteen']] -// position = [[1, 2, 3], -// [1, 2, 3], -// [1, 2, 3]] -// length = [[2, 3, 4], -// [4, 3, 2], -// [5, 5, 5]] -// -// output = [[b'en', b'eve', b'lve'], -// [b'hirt', b'urt', b'te'], -// [b'ixtee', b'vente', b'hteen']] -// ``` -// -// Broadcasting `pos` and `len` onto `input`: -// -// ``` -// input = [[b'ten', b'eleven', b'twelve'], -// [b'thirteen', b'fourteen', b'fifteen'], -// [b'sixteen', b'seventeen', b'eighteen'], -// [b'nineteen', b'twenty', b'twentyone']] -// position = [1, 2, 3] -// length = [1, 2, 3] -// -// output = [[b'e', b'ev', b'lve'], -// [b'h', b'ur', b'tee'], -// [b'i', b've', b'hte'], -// [b'i', b'en', b'nty']] -// ``` -// -// Broadcasting `input` onto `pos` and `len`: -// -// ``` -// input = b'thirteen' -// position = [1, 5, 7] -// length = [3, 2, 1] -// -// output = [b'hir', b'ee', b'n'] -// ``` -// -// Arguments: -// input: Tensor of strings -// pos: Scalar defining the position of first character in each substring -// len: Scalar defining the number of characters to include in each substring -// -// Returns Tensor of substrings -func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Substr", - Input: []tf.Input{ - input, pos, len, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes reciprocal of square root of x element-wise. -// -// I.e., \\(y = 1 / \sqrt{x}\\). 
-func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
- if scope.Err() != nil {
- return
- }
- opspec := tf.OpSpec{
- Type: "Rsqrt",
+ Type: "LogicalNot",
 Input: []tf.Input{
 x,
 },
@@ -12711,60 +17541,583 @@ func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
 return op.Output(0)
 }

-// Adds a value to the current value of a variable.
+// Computes the reciprocal of x element-wise.
 //
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the incremented value or a subsequent newer one.
-//
-// Arguments:
-// resource: handle to the resource in which to store the variable.
-// value: the value by which the variable will be incremented.
-//
-// Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+// I.e., \\(y = 1 / x\\).
+func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 if scope.Err() != nil {
 return
 }
 opspec := tf.OpSpec{
- Type: "AssignAddVariableOp",
+ Type: "Reciprocal",
 Input: []tf.Input{
- resource, value,
+ x,
+ },
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0)
+}
+
+// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
+type LogUniformCandidateSamplerAttr func(optionalAttr)
+
+// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed. Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
+ return func(m optionalAttr) {
+ m["seed"] = value
+ }
+}
+
+// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+ return func(m optionalAttr) {
+ m["seed2"] = value
+ }
+}
+
+// Generates labels for candidate sampling with a log-uniform distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+// true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+// num_true: Number of true labels per context.
+// num_sampled: Number of candidates to randomly sample.
+// unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+// range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates.
If unique=true, then this is a probability.A vector of length num_sampled, for each sampled +// candidate representing the number of times the candidate is expected +// to occur in a batch of sampled candidates. If unique=true, then this is a +// probability. +func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LogUniformCandidateSampler", + Input: []tf.Input{ + true_classes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Computes the minimum along segments of a tensor. +// +// Read +// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) +// for an explanation of segments. +// +// Computes a tensor such that +// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such +// that `segment_ids[j] == i`. +// +// If the min is empty for a given segment ID `i`, `output[i] = 0`. +// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt> +// </div> +// +// For example: +// +// ``` +// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]]) +// tf.segment_min(c, tf.constant([0, 0, 1])) +// # ==> [[1, 2, 2, 1], +// # [5, 6, 7, 8]] +// ``` +// +// Arguments: +// +// segment_ids: A 1-D tensor whose size is equal to the size of `data`'s +// first dimension. Values should be sorted and can be repeated. +// +// Returns Has same shape as data, except for dimension 0 which +// has size `k`, the number of segments. +func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SegmentMin", + Input: []tf.Input{ + data, segment_ids, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// QuantizedMulAttr is an optional argument to QuantizedMul. +type QuantizedMulAttr func(optionalAttr) + +// QuantizedMulToutput sets the optional Toutput attribute to value. +// If not specified, defaults to DT_QINT32 +func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr { + return func(m optionalAttr) { + m["Toutput"] = value + } +} + +// Returns x * y element-wise, working on quantized buffers. +// +// Arguments: +// +// +// min_x: The float value that the lowest quantized `x` value represents. +// max_x: The float value that the highest quantized `x` value represents. +// min_y: The float value that the lowest quantized `y` value represents. +// max_y: The float value that the highest quantized `y` value represents. +// +// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. +// +// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. 
More about +// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QuantizedMul", + Input: []tf.Input{ + x, y, min_x, max_x, min_y, max_y, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise. +// +// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs) +// ](http://arxiv.org/abs/1511.07289) +func Elu(scope *Scope, features tf.Output) (activations tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Elu", + Input: []tf.Input{ + features, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Constructs an Optional variant from a tuple of tensors. +func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "OptionalFromValue", + Input: []tf.Input{ + tf.OutputList(components), + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes fingerprints of the input strings. +// +// Arguments: +// input: vector of strings to compute fingerprints on. +// +// Returns a (N,2) shaped matrix where N is the number of elements in the input +// vector. Each row contains the low and high parts of the fingerprint. +func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SdcaFprint", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Deserialize bucket boundaries and ready flag into current QuantileAccumulator. +// +// An op that deserializes bucket boundaries and are boundaries ready flag into current QuantileAccumulator. +// +// Arguments: +// quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource. +// bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a feature. +// +// Returns the created operation. +func BoostedTreesQuantileStreamResourceDeserialize(scope *Scope, quantile_stream_resource_handle tf.Output, bucket_boundaries []tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BoostedTreesQuantileStreamResourceDeserialize", + Input: []tf.Input{ + quantile_stream_resource_handle, tf.OutputList(bucket_boundaries), }, } return scope.AddOperation(opspec) } -// Draw bounding boxes on a batch of images. -// -// Outputs a copy of `images` but draws on top of the pixels zero or more bounding -// boxes specified by the locations in `boxes`. The coordinates of the each -// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The -// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and -// height of the underlying image. -// -// For example, if an image is 100 x 200 pixels (height x width) and the bounding -// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of -// the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates). 
-// -// Parts of the bounding box may fall outside the image. +// Creates a dataset that batches input elements into a SparseTensor. // // Arguments: -// images: 4-D with shape `[batch, height, width, depth]`. A batch of images. -// boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding -// boxes. -// colors: 2-D. A list of RGBA colors to cycle through for the boxes. +// input_dataset: A handle to an input dataset. Must have a single component. +// batch_size: A scalar representing the number of elements to accumulate in a +// batch. +// row_shape: A vector representing the dense shape of each row in the produced +// SparseTensor. The shape may be partially specified, using `-1` to indicate +// that a particular dimension should use the maximum size of all batch elements. // -// Returns 4-D with the same shape as `images`. The batch of input images with -// bounding boxes drawn on the images. -func DrawBoundingBoxesV2(scope *Scope, images tf.Output, boxes tf.Output, colors tf.Output) (output tf.Output) { +// +func ExperimentalDenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { if scope.Err() != nil { return } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} opspec := tf.OpSpec{ - Type: "DrawBoundingBoxesV2", + Type: "ExperimentalDenseToSparseBatchDataset", Input: []tf.Input{ - images, boxes, colors, + input_dataset, batch_size, row_shape, }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// TPUReplicateMetadataAttr is an optional argument to TPUReplicateMetadata. +type TPUReplicateMetadataAttr func(optionalAttr) + +// TPUReplicateMetadataNumCoresPerReplica sets the optional num_cores_per_replica attribute to value. +// +// value: Number of cores per replica. Used for model parallelism. +// If not specified, defaults to 1 +func TPUReplicateMetadataNumCoresPerReplica(value int64) TPUReplicateMetadataAttr { + return func(m optionalAttr) { + m["num_cores_per_replica"] = value + } +} + +// TPUReplicateMetadataTopology sets the optional topology attribute to value. +// +// value: TopologyProto indicating the topology of the TPU pod slice. +// If not specified, defaults to "" +func TPUReplicateMetadataTopology(value string) TPUReplicateMetadataAttr { + return func(m optionalAttr) { + m["topology"] = value + } +} + +// TPUReplicateMetadataUseTpu sets the optional use_tpu attribute to value. +// +// value: Whether to place the computation on the TPU. +// If not specified, defaults to true +func TPUReplicateMetadataUseTpu(value bool) TPUReplicateMetadataAttr { + return func(m optionalAttr) { + m["use_tpu"] = value + } +} + +// TPUReplicateMetadataDeviceAssignment sets the optional device_assignment attribute to value. +// +// value: The assignment of devices for the computation. +// If not specified, defaults to <> +func TPUReplicateMetadataDeviceAssignment(value []int64) TPUReplicateMetadataAttr { + return func(m optionalAttr) { + m["device_assignment"] = value + } +} + +// TPUReplicateMetadataComputationShape sets the optional computation_shape attribute to value. +// +// value: DEPRECATED. Use num_cores_per_replica instead. 
+// If not specified, defaults to <>
+func TPUReplicateMetadataComputationShape(value []int64) TPUReplicateMetadataAttr {
+ return func(m optionalAttr) {
+ m["computation_shape"] = value
+ }
+}
+
+// TPUReplicateMetadataHostComputeCore sets the optional host_compute_core attribute to value.
+// If not specified, defaults to <>
+func TPUReplicateMetadataHostComputeCore(value []string) TPUReplicateMetadataAttr {
+ return func(m optionalAttr) {
+ m["host_compute_core"] = value
+ }
+}
+
+// TPUReplicateMetadataPaddingMap sets the optional padding_map attribute to value.
+// If not specified, defaults to <>
+func TPUReplicateMetadataPaddingMap(value []string) TPUReplicateMetadataAttr {
+ return func(m optionalAttr) {
+ m["padding_map"] = value
+ }
+}
+
+// TPUReplicateMetadataStepMarkerLocation sets the optional step_marker_location attribute to value.
+// If not specified, defaults to "STEP_MARK_AT_ENTRY"
+func TPUReplicateMetadataStepMarkerLocation(value string) TPUReplicateMetadataAttr {
+ return func(m optionalAttr) {
+ m["step_marker_location"] = value
+ }
+}
+
+// Metadata indicating how the TPU computation should be replicated.
+//
+// Arguments:
+// num_replicas: Number of replicas of the computation
+//
+// Returns the created operation.
+func TPUReplicateMetadata(scope *Scope, num_replicas int64, optional ...TPUReplicateMetadataAttr) (o *tf.Operation) {
+ if scope.Err() != nil {
+ return
+ }
+ attrs := map[string]interface{}{"num_replicas": num_replicas}
+ for _, a := range optional {
+ a(attrs)
+ }
+ opspec := tf.OpSpec{
+ Type: "TPUReplicateMetadata",
+
+ Attrs: attrs,
+ }
+ return scope.AddOperation(opspec)
+}
+
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.
+type RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr {
+ return func(m optionalAttr) {
+ m["table_id"] = value
+ }
+}
+
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr {
+ return func(m optionalAttr) {
+ m["table_name"] = value
+ }
+}
+
+// Retrieve Momentum embedding parameters with debug support.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the Momentum optimization algorithm.Parameter momenta updated by the Momentum optimization algorithm.Parameter gradient_accumulators updated by the Momentum optimization algorithm.
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// ConfigureDistributedTPUAttr is an optional argument to ConfigureDistributedTPU. +type ConfigureDistributedTPUAttr func(optionalAttr) + +// ConfigureDistributedTPUEmbeddingConfig sets the optional embedding_config attribute to value. +// +// value: Reserved. Do not use. +// If not specified, defaults to "" +func ConfigureDistributedTPUEmbeddingConfig(value string) ConfigureDistributedTPUAttr { + return func(m optionalAttr) { + m["embedding_config"] = value + } +} + +// ConfigureDistributedTPUTpuEmbeddingConfig sets the optional tpu_embedding_config attribute to value. +// +// value: Serialized tensorflow.tpu.TPUEmbeddingConfiguration that +// describes the embedding lookups of the program. +// If not specified, defaults to "" +func ConfigureDistributedTPUTpuEmbeddingConfig(value string) ConfigureDistributedTPUAttr { + return func(m optionalAttr) { + m["tpu_embedding_config"] = value + } +} + +// ConfigureDistributedTPUIsGlobalInit sets the optional is_global_init attribute to value. +// +// value: Reserved. Do not use. +// If not specified, defaults to false +func ConfigureDistributedTPUIsGlobalInit(value bool) ConfigureDistributedTPUAttr { + return func(m optionalAttr) { + m["is_global_init"] = value + } +} + +// Sets up the centralized structures for a distributed TPU system. +// +// Returns A serialized tensorflow.tpu.TopologyProto that describes the TPU +// topology. +func ConfigureDistributedTPU(scope *Scope, optional ...ConfigureDistributedTPUAttr) (topology tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ConfigureDistributedTPU", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp. +type ResourceSparseApplyRMSPropAttr func(optionalAttr) + +// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var, ms, and mom tensors is protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' according to the RMSProp algorithm. +// +// Note that in dense implementation of this algorithm, ms and mom will +// update even if the grad is zero, but in this sparse implementation, ms +// and mom will not update in iterations during which the grad is zero. 
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+// var_: Should be from a Variable().
+// ms: Should be from a Variable().
+// mom: Should be from a Variable().
+// lr: Scaling factor. Must be a scalar.
+// rho: Decay rate. Must be a scalar.
+//
+// epsilon: Ridge term. Must be a scalar.
+// grad: The gradient.
+// indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+ if scope.Err() != nil {
+ return
+ }
+ attrs := map[string]interface{}{}
+ for _, a := range optional {
+ a(attrs)
+ }
+ opspec := tf.OpSpec{
+ Type: "ResourceSparseApplyRMSProp",
+ Input: []tf.Input{
+ var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+ },
+ Attrs: attrs,
+ }
+ return scope.AddOperation(opspec)
+}
+
+// EmptyAttr is an optional argument to Empty.
+type EmptyAttr func(optionalAttr)
+
+// EmptyInit sets the optional init attribute to value.
+//
+// value: If True, initialize the returned tensor with the default value of dtype. Otherwise, the implementation is free not to initialize the tensor's content.
+// If not specified, defaults to false
+func EmptyInit(value bool) EmptyAttr {
+ return func(m optionalAttr) {
+ m["init"] = value
+ }
+}
+
+// Creates a tensor with the given shape.
+//
+// This operation creates a tensor of `shape` and `dtype`.
+//
+// Arguments:
+// shape: 1-D. Represents the shape of the output tensor.
+//
+//
+// Returns A `Tensor` of type `T`.
+func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ attrs := map[string]interface{}{"dtype": dtype}
+ for _, a := range optional {
+ a(attrs)
+ }
+ opspec := tf.OpSpec{
+ Type: "Empty",
+ Input: []tf.Input{
+ shape,
+ },
+ Attrs: attrs,
 }
 op := scope.AddOperation(opspec)
 return op.Output(0)
@@ -12889,1307 +18242,6 @@ func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, o
 return op.Output(0)
 }

-// StatefulUniformFullIntAttr is an optional argument to StatefulUniformFullInt.
-type StatefulUniformFullIntAttr func(optionalAttr)
-
-// StatefulUniformFullIntDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_UINT64
-func StatefulUniformFullIntDtype(value tf.DataType) StatefulUniformFullIntAttr {
- return func(m optionalAttr) {
- m["dtype"] = value
- }
-}
-
-// Outputs random integers from a uniform distribution.
-//
-// The generated values are uniform integers covering the whole range of `dtype`.
-//
-// Arguments:
-// resource: The handle of the resource variable that stores the state of the RNG.
-// algorithm: The RNG algorithm.
-// shape: The shape of the output tensor.
-//
-// Returns Random values with specified shape.
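The RMSProp equations quoted in the `ResourceSparseApplyRMSProp` comment reduce to a few lines for a single dense element. Below is a hedged sketch with illustrative scalar stand-ins; it mirrors the documented math only and is not the op's API.

```go
package main

import (
	"fmt"
	"math"
)

// rmsPropStep applies the (dense) update quoted in the comment above:
//   ms  = rho*ms + (1-rho)*grad*grad
//   mom = momentum*mom + lr*grad/sqrt(ms + epsilon)
//   w   = w - mom
func rmsPropStep(w, ms, mom, lr, rho, momentum, epsilon, grad float64) (float64, float64, float64) {
	ms = rho*ms + (1-rho)*grad*grad
	mom = momentum*mom + lr*grad/math.Sqrt(ms+epsilon)
	w -= mom
	return w, ms, mom
}

func main() {
	w, ms, mom := 1.0, 0.0, 0.0
	for i := 0; i < 3; i++ {
		w, ms, mom = rmsPropStep(w, ms, mom, 0.01, 0.9, 0.9, 1e-10, 0.5)
	}
	fmt.Printf("w=%.6f ms=%.6f mom=%.6f\n", w, ms, mom)
}
```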
-func StatefulUniformFullInt(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulUniformFullIntAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StatefulUniformFullInt", - Input: []tf.Input{ - resource, algorithm, shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// A dataset that splits the elements of its input into multiple elements. -func ExperimentalUnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "ExperimentalUnbatchDataset", - Input: []tf.Input{ - input_dataset, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MutexV2Attr is an optional argument to MutexV2. -type MutexV2Attr func(optionalAttr) - -// MutexV2Container sets the optional container attribute to value. -// -// value: If non-empty, this variable is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func MutexV2Container(value string) MutexV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// MutexV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this variable is named in the given bucket -// with this shared_name. Otherwise, the node name is used instead. -// If not specified, defaults to "" -func MutexV2SharedName(value string) MutexV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Creates a Mutex resource that can be locked by `MutexLock`. -// -// Returns The mutex resource. -func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MutexV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// This op consumes a lock created by `MutexLock`. -// -// This op exists to consume a tensor created by `MutexLock` (other than -// direct control dependencies). It should be the only that consumes the tensor, -// and will raise an error if it is not. Its only purpose is to keep the -// mutex lock tensor alive until it is consumed by this op. -// -// **NOTE**: This operation must run on the same device as its input. This may -// be enforced via the `colocate_with` mechanism. -// -// Arguments: -// mutex_lock: A tensor returned by `MutexLock`. -// -// Returns the created operation. -func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ConsumeMutexLock", - Input: []tf.Input{ - mutex_lock, - }, - } - return scope.AddOperation(opspec) -} - -// Add all input tensors element wise. -// -// Arguments: -// inputs: Must all be the same size and shape. -func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "AddN", - Input: []tf.Input{ - tf.OutputList(inputs), - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RandomPoissonAttr is an optional argument to RandomPoisson. 
-type RandomPoissonAttr func(optionalAttr) - -// RandomPoissonSeed sets the optional seed attribute to value. -// If not specified, defaults to 0 -func RandomPoissonSeed(value int64) RandomPoissonAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// RandomPoissonSeed2 sets the optional seed2 attribute to value. -// If not specified, defaults to 0 -func RandomPoissonSeed2(value int64) RandomPoissonAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Use RandomPoissonV2 instead. -// -// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2 -func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RandomPoisson", - Input: []tf.Input{ - shape, rate, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ComplexAttr is an optional argument to Complex. -type ComplexAttr func(optionalAttr) - -// ComplexTout sets the optional Tout attribute to value. -// If not specified, defaults to DT_COMPLEX64 -func ComplexTout(value tf.DataType) ComplexAttr { - return func(m optionalAttr) { - m["Tout"] = value - } -} - -// Converts two real numbers to a complex number. -// -// Given a tensor `real` representing the real part of a complex number, and a -// tensor `imag` representing the imaginary part of a complex number, this -// operation returns complex numbers elementwise of the form \\(a + bj\\), where -// *a* represents the `real` part and *b* represents the `imag` part. -// -// The input tensors `real` and `imag` must have the same shape. -// -// For example: -// -// ``` -// # tensor 'real' is [2.25, 3.25] -// # tensor `imag` is [4.75, 5.75] -// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]] -// ``` -func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Complex", - Input: []tf.Input{ - real, imag, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the number of elements in the given queue. -// -// Arguments: -// handle: The handle to a queue. -// -// Returns The number of elements in the given queue. -func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "QueueSizeV2", - Input: []tf.Input{ - handle, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceApplyAdamWithAmsgradAttr is an optional argument to ResourceApplyAdamWithAmsgrad. -type ResourceApplyAdamWithAmsgradAttr func(optionalAttr) - -// ResourceApplyAdamWithAmsgradUseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var, m, and v tensors will be protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceApplyAdamWithAmsgradUseLocking(value bool) ResourceApplyAdamWithAmsgradAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update '*var' according to the Adam algorithm. 
-// -// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$ -// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ -// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ -// $$vhat_t := max{vhat_{t-1}, v_t}$$ -// $$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$ -// -// Arguments: -// var_: Should be from a Variable(). -// m: Should be from a Variable(). -// v: Should be from a Variable(). -// vhat: Should be from a Variable(). -// beta1_power: Must be a scalar. -// beta2_power: Must be a scalar. -// lr: Scaling factor. Must be a scalar. -// beta1: Momentum factor. Must be a scalar. -// beta2: Momentum factor. Must be a scalar. -// epsilon: Ridge term. Must be a scalar. -// grad: The gradient. -// -// Returns the created operation. -func ResourceApplyAdamWithAmsgrad(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, vhat tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamWithAmsgradAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceApplyAdamWithAmsgrad", - Input: []tf.Input{ - var_, m, v, vhat, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Computes square root of x element-wise. -// -// I.e., \\(y = \sqrt{x} = x^{1/2}\\). -func Sqrt(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Sqrt", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// NthElementAttr is an optional argument to NthElement. -type NthElementAttr func(optionalAttr) - -// NthElementReverse sets the optional reverse attribute to value. -// -// value: When set to True, find the nth-largest value in the vector and vice -// versa. -// If not specified, defaults to false -func NthElementReverse(value bool) NthElementAttr { - return func(m optionalAttr) { - m["reverse"] = value - } -} - -// Finds values of the `n`-th order statistic for the last dimension. -// -// If the input is a vector (rank-1), finds the entries which is the nth-smallest -// value in the vector and outputs their values as scalar tensor. -// -// For matrices (resp. higher rank input), computes the entries which is the -// nth-smallest value in each row (resp. vector along the last dimension). Thus, -// -// values.shape = input.shape[:-1] -// -// Arguments: -// input: 1-D or higher with last dimension at least `n+1`. -// n: 0-D. Position of sorted vector to select along the last dimension (along -// each row for matrices). Valid range of n is `[0, input.shape[:-1])` -// -// Returns The `n`-th order statistic along each last dimensional slice. -func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "NthElement", - Input: []tf.Input{ - input, n, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// SdcaOptimizerAttr is an optional argument to SdcaOptimizer. -type SdcaOptimizerAttr func(optionalAttr) - -// SdcaOptimizerAdaptative sets the optional adaptative attribute to value. -// -// value: Whether to use Adaptive SDCA for the inner loop. 
-// If not specified, defaults to true -func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr { - return func(m optionalAttr) { - m["adaptative"] = value - } -} - -// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for -// -// linear models with L1 + L2 regularization. As global optimization objective is -// strongly-convex, the optimizer optimizes the dual objective at each step. The -// optimizer applies each update one example at a time. Examples are sampled -// uniformly, and the optimizer is learning rate free and enjoys linear convergence -// rate. -// -// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br> -// Shai Shalev-Shwartz, Tong Zhang. 2012 -// -// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$ -// -// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br> -// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan, -// Peter Richtarik, Martin Takac. 2015 -// -// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br> -// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015 -// -// Arguments: -// sparse_example_indices: a list of vectors which contain example indices. -// sparse_feature_indices: a list of vectors which contain feature indices. -// sparse_feature_values: a list of vectors which contains feature value -// associated with each feature group. -// dense_features: a list of matrices which contains the dense feature values. -// example_weights: a vector which contains the weight associated with each -// example. -// example_labels: a vector which contains the label/target associated with each -// example. -// sparse_indices: a list of vectors where each value is the indices which has -// corresponding weights in sparse_weights. This field maybe omitted for the -// dense approach. -// sparse_weights: a list of vectors where each value is the weight associated with -// a sparse feature group. -// dense_weights: a list of vectors where the values are the weights associated -// with a dense feature group. -// example_state_data: a list of vectors containing the example state data. -// loss_type: Type of the primal loss. Currently SdcaSolver supports logistic, -// squared and hinge losses. -// l1: Symmetric l1 regularization strength. -// l2: Symmetric l2 regularization strength. -// num_loss_partitions: Number of partitions of the global loss function. -// num_inner_iterations: Number of iterations per mini-batch. -// -// Returns a list of vectors containing the updated example state -// data.a list of vectors where each value is the delta -// weights associated with a sparse feature group.a list of vectors where the values are the delta -// weights associated with a dense feature group. 
-func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SdcaOptimizer", - Input: []tf.Input{ - tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - out_example_state_data = op.Output(idx) - if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil { - scope.UpdateErr("SdcaOptimizer", err) - return - } - if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil { - scope.UpdateErr("SdcaOptimizer", err) - return - } - return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights -} - -// EncodeBase64Attr is an optional argument to EncodeBase64. -type EncodeBase64Attr func(optionalAttr) - -// EncodeBase64Pad sets the optional pad attribute to value. -// -// value: Bool whether padding is applied at the ends. -// If not specified, defaults to false -func EncodeBase64Pad(value bool) EncodeBase64Attr { - return func(m optionalAttr) { - m["pad"] = value - } -} - -// Encode strings into web-safe base64 format. -// -// Refer to the following article for more information on base64 format: -// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the -// end so that the encoded has length multiple of 4. See Padding section of the -// link above. -// -// Web-safe means that the encoder uses - and _ instead of + and /. -// -// Arguments: -// input: Strings to be encoded. -// -// Returns Input strings encoded in base64. -func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "EncodeBase64", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// AvgPool3DAttr is an optional argument to AvgPool3D. -type AvgPool3DAttr func(optionalAttr) - -// AvgPool3DDataFormat sets the optional data_format attribute to value. -// -// value: The data format of the input and output data. With the -// default format "NDHWC", the data is stored in the order of: -// [batch, in_depth, in_height, in_width, in_channels]. -// Alternatively, the format could be "NCDHW", the data storage order is: -// [batch, in_channels, in_depth, in_height, in_width]. 
-// If not specified, defaults to "NDHWC" -func AvgPool3DDataFormat(value string) AvgPool3DAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Performs 3D average pooling on the input. -// -// Arguments: -// input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over. -// ksize: 1-D tensor of length 5. The size of the window for each dimension of -// the input tensor. Must have `ksize[0] = ksize[4] = 1`. -// strides: 1-D tensor of length 5. The stride of the sliding window for each -// dimension of `input`. Must have `strides[0] = strides[4] = 1`. -// padding: The type of padding algorithm to use. -// -// Returns The average pooled output tensor. -func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "AvgPool3D", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation. -type SparseToSparseSetOperationAttr func(optionalAttr) - -// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value. -// If not specified, defaults to true -func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr { - return func(m optionalAttr) { - m["validate_indices"] = value - } -} - -// Applies set operation along last dimension of 2 `SparseTensor` inputs. -// -// See SetOperationOp::SetOperationFromContext for values of `set_operation`. -// -// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the -// order and range of `set1` and `set2` indices. -// -// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`, -// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same -// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but -// ignored. -// -// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`, -// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same -// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but -// ignored. -// -// If `validate_indices` is `True`, this op validates the order and range of `set1` -// and `set2` indices. -// -// Output `result` is a `SparseTensor` represented by `result_indices`, -// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this -// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth` -// dimension contains the result of `set_operation` applied to the corresponding -// `[0...n-1]` dimension of `set`. -// -// Arguments: -// set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major -// order. -// set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major -// order. -// set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must -// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the -// max set size across `0...n-1` dimensions. -// set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major -// order. -// set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major -// order. -// set2_shape: 1D `Tensor`, shape of a `SparseTensor`. 
`set2_shape[0...n-1]` must -// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the -// max set size across `0...n-1` dimensions. -// -// -// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is -// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]` -// is the max result set size across all `0...n-1` dimensions. -func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"set_operation": set_operation} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SparseToSparseSetOperation", - Input: []tf.Input{ - set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// UnicodeDecodeWithOffsetsAttr is an optional argument to UnicodeDecodeWithOffsets. -type UnicodeDecodeWithOffsetsAttr func(optionalAttr) - -// UnicodeDecodeWithOffsetsErrors sets the optional errors attribute to value. -// -// value: Error handling policy when there is invalid formatting found in the input. -// The value of 'strict' will cause the operation to produce a InvalidArgument -// error on any invalid input formatting. A value of 'replace' (the default) will -// cause the operation to replace any invalid formatting in the input with the -// `replacement_char` codepoint. A value of 'ignore' will cause the operation to -// skip any invalid formatting in the input and produce no corresponding output -// character. -// If not specified, defaults to "replace" -func UnicodeDecodeWithOffsetsErrors(value string) UnicodeDecodeWithOffsetsAttr { - return func(m optionalAttr) { - m["errors"] = value - } -} - -// UnicodeDecodeWithOffsetsReplacementChar sets the optional replacement_char attribute to value. -// -// value: The replacement character codepoint to be used in place of any invalid -// formatting in the input when `errors='replace'`. Any valid unicode codepoint may -// be used. The default value is the default unicode replacement character is -// 0xFFFD or U+65533.) -// If not specified, defaults to 65533 -func UnicodeDecodeWithOffsetsReplacementChar(value int64) UnicodeDecodeWithOffsetsAttr { - return func(m optionalAttr) { - m["replacement_char"] = value - } -} - -// UnicodeDecodeWithOffsetsReplaceControlCharacters sets the optional replace_control_characters attribute to value. -// -// value: Whether to replace the C0 control characters (00-1F) with the -// `replacement_char`. Default is false. -// If not specified, defaults to false -func UnicodeDecodeWithOffsetsReplaceControlCharacters(value bool) UnicodeDecodeWithOffsetsAttr { - return func(m optionalAttr) { - m["replace_control_characters"] = value - } -} - -// UnicodeDecodeWithOffsetsTsplits sets the optional Tsplits attribute to value. -// If not specified, defaults to DT_INT64 -func UnicodeDecodeWithOffsetsTsplits(value tf.DataType) UnicodeDecodeWithOffsetsAttr { - return func(m optionalAttr) { - m["Tsplits"] = value - } -} - -// Decodes each string in `input` into a sequence of Unicode code points. 
-// -// The character codepoints for all strings are returned using a single vector -// `char_values`, with strings expanded to characters in row-major order. -// Similarly, the character start byte offsets are returned using a single vector -// `char_to_byte_starts`, with strings expanded in row-major order. -// -// The `row_splits` tensor indicates where the codepoints and start offsets for -// each input string begin and end within the `char_values` and -// `char_to_byte_starts` tensors. In particular, the values for the `i`th -// string (in row-major order) are stored in the slice -// `[row_splits[i]:row_splits[i+1]]`. Thus: -// -// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th -// character in the `i`th string (in row-major order). -// * `char_to_bytes_starts[row_splits[i]+j]` is the start byte offset for the `j`th -// character in the `i`th string (in row-major order). -// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th -// string (in row-major order). -// -// Arguments: -// input: The text to be decoded. Can have any shape. Note that the output is flattened -// to a vector of char values. -// input_encoding: Text encoding of the input strings. This is any of the encodings supported -// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`. -// -// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.A 1D int32 Tensor containing the byte index in the input string where each -// character in `char_values` starts. -func UnicodeDecodeWithOffsets(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeWithOffsetsAttr) (row_splits tf.Output, char_values tf.Output, char_to_byte_starts tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"input_encoding": input_encoding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "UnicodeDecodeWithOffsets", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse. -type SparseReduceSumSparseAttr func(optionalAttr) - -// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value. -// -// value: If true, retain reduced dimensions with length 1. -// If not specified, defaults to false -func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr { - return func(m optionalAttr) { - m["keep_dims"] = value - } -} - -// Computes the sum of elements across dimensions of a SparseTensor. -// -// This Op takes a SparseTensor and is the sparse counterpart to -// `tf.reduce_sum()`. In contrast to SparseReduceSum, this Op returns a -// SparseTensor. -// -// Reduces `sp_input` along the dimensions given in `reduction_axes`. Unless -// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained -// with length 1. -// -// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor -// with a single element is returned. Additionally, the axes can be negative, -// which are interpreted according to the indexing rules in Python. -// -// Arguments: -// input_indices: 2-D. `N x R` matrix with the indices of non-empty values in a -// SparseTensor, possibly not in canonical ordering. -// input_values: 1-D. 
`N` non-empty values corresponding to `input_indices`. -// input_shape: 1-D. Shape of the input SparseTensor. -// reduction_axes: 1-D. Length-`K` vector containing the reduction axes. -func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SparseReduceSumSparse", - Input: []tf.Input{ - input_indices, input_values, input_shape, reduction_axes, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Inverse 3D fast Fourier transform. -// -// Computes the inverse 3-dimensional discrete Fourier transform over the -// inner-most 3 dimensions of `input`. -// -// Arguments: -// input: A complex64 tensor. -// -// Returns A complex64 tensor of the same shape as `input`. The inner-most 3 -// dimensions of `input` are replaced with their inverse 3D Fourier transform. -// -// @compatibility(numpy) -// Equivalent to np.fft.ifftn with 3 dimensions. -// @end_compatibility -func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "IFFT3D", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// FakeQuantWithMinMaxVarsPerChannelGradientAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannelGradient. -type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr) - -// FakeQuantWithMinMaxVarsPerChannelGradientNumBits sets the optional num_bits attribute to value. -// -// value: The bitwidth of the quantization; between 2 and 16, inclusive. -// If not specified, defaults to 8 -func FakeQuantWithMinMaxVarsPerChannelGradientNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelGradientAttr { - return func(m optionalAttr) { - m["num_bits"] = value - } -} - -// FakeQuantWithMinMaxVarsPerChannelGradientNarrowRange sets the optional narrow_range attribute to value. -// -// value: Whether to quantize into 2^num_bits - 1 distinct values. -// If not specified, defaults to false -func FakeQuantWithMinMaxVarsPerChannelGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsPerChannelGradientAttr { - return func(m optionalAttr) { - m["narrow_range"] = value - } -} - -// Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation. -// -// Arguments: -// gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation, -// shape one of: `[d]`, `[b, d]`, `[b, h, w, d]`. -// inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation, shape -// same as `gradients`. -// min, max: Quantization interval, floats of shape `[d]`. -// -// -// -// Returns Backpropagated gradients w.r.t. inputs, shape same as -// `inputs`: -// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter, shape `[d]`: -// `sum_per_d(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter, shape `[d]`: -// `sum_per_d(gradients * (inputs > max))`. 
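The per-channel gradient rules quoted above for FakeQuantWithMinMaxVarsPerChannelGradient are easy to sanity-check in plain Go. The helper below is a hypothetical illustration for a single channel, not the TensorFlow kernel: gradients pass through only where min <= input <= max, and out-of-range gradients accumulate into the min/max gradients.

package main

import "fmt"

// fakeQuantGrads mirrors, for one channel, the backprop rules above:
//   dInputs[i] = gradients[i] where min <= inputs[i] <= max
//   dMin       = sum(gradients[i] where inputs[i] < min)
//   dMax       = sum(gradients[i] where inputs[i] > max)
func fakeQuantGrads(gradients, inputs []float64, min, max float64) (dInputs []float64, dMin, dMax float64) {
	dInputs = make([]float64, len(inputs))
	for i, x := range inputs {
		switch {
		case x < min:
			dMin += gradients[i]
		case x > max:
			dMax += gradients[i]
		default:
			dInputs[i] = gradients[i]
		}
	}
	return dInputs, dMin, dMax
}

func main() {
	g := []float64{1, 1, 1, 1}
	x := []float64{-2, 0, 0.5, 3}
	dIn, dMin, dMax := fakeQuantGrads(g, x, -1, 1)
	fmt.Println(dIn, dMin, dMax) // [0 1 1 0] 1 1
}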
-func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "FakeQuantWithMinMaxVarsPerChannelGradient", - Input: []tf.Input{ - gradients, inputs, min, max, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Returns the element-wise min of two SparseTensors. -// -// Assumes the two SparseTensors have the same shape, i.e., no broadcasting. -// -// Arguments: -// a_indices: 2-D. `N x R` matrix with the indices of non-empty values in a -// SparseTensor, in the canonical lexicographic ordering. -// a_values: 1-D. `N` non-empty values corresponding to `a_indices`. -// a_shape: 1-D. Shape of the input SparseTensor. -// b_indices: counterpart to `a_indices` for the other operand. -// b_values: counterpart to `a_values` for the other operand; must be of the same dtype. -// b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal. -// -// Returns 2-D. The indices of the output SparseTensor.1-D. The values of the output SparseTensor. -func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseSparseMinimum", - Input: []tf.Input{ - a_indices, a_values, a_shape, b_indices, b_values, b_shape, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// Returns the element-wise max of two SparseTensors. -// -// Assumes the two SparseTensors have the same shape, i.e., no broadcasting. -// -// Arguments: -// a_indices: 2-D. `N x R` matrix with the indices of non-empty values in a -// SparseTensor, in the canonical lexicographic ordering. -// a_values: 1-D. `N` non-empty values corresponding to `a_indices`. -// a_shape: 1-D. Shape of the input SparseTensor. -// b_indices: counterpart to `a_indices` for the other operand. -// b_values: counterpart to `a_values` for the other operand; must be of the same dtype. -// b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal. -// -// Returns 2-D. The indices of the output SparseTensor.1-D. The values of the output SparseTensor. -func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseSparseMaximum", - Input: []tf.Input{ - a_indices, a_values, a_shape, b_indices, b_values, b_shape, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// Adds up a SparseTensor and a dense Tensor, using these special rules: -// -// (1) Broadcasts the dense side to have the same shape as the sparse side, if -// eligible; -// (2) Then, only the dense values pointed to by the indices of the SparseTensor -// participate in the cwise addition. 
-// -// By these rules, the result is a logical SparseTensor with exactly the same -// indices and shape, but possibly with different non-zero values. The output of -// this Op is the resultant non-zero values. -// -// Arguments: -// sp_indices: 2-D. `N x R` matrix with the indices of non-empty values in a -// SparseTensor, possibly not in canonical ordering. -// sp_values: 1-D. `N` non-empty values corresponding to `sp_indices`. -// sp_shape: 1-D. Shape of the input SparseTensor. -// dense: `R`-D. The dense Tensor operand. -// -// Returns 1-D. The `N` values that are operated on. -func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseDenseCwiseAdd", - Input: []tf.Input{ - sp_indices, sp_values, sp_shape, dense, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns 0 if x == 0, and x * log(y) otherwise, elementwise. -func Xlogy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Xlogy", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Component-wise divides a SparseTensor by a dense Tensor. -// -// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not -// the other direction. -// -// Arguments: -// sp_indices: 2-D. `N x R` matrix with the indices of non-empty values in a -// SparseTensor, possibly not in canonical ordering. -// sp_values: 1-D. `N` non-empty values corresponding to `sp_indices`. -// sp_shape: 1-D. Shape of the input SparseTensor. -// dense: `R`-D. The dense Tensor operand. -// -// Returns 1-D. The `N` values that are operated on. -func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseDenseCwiseDiv", - Input: []tf.Input{ - sp_indices, sp_values, sp_shape, dense, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Component-wise multiplies a SparseTensor by a dense Tensor. -// -// The output locations corresponding to the implicitly zero elements in the sparse -// tensor will be zero (i.e., will not take up storage space), regardless of the -// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN). -// -// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not -// the other direction. -// -// Arguments: -// sp_indices: 2-D. `N x R` matrix with the indices of non-empty values in a -// SparseTensor, possibly not in canonical ordering. -// sp_values: 1-D. `N` non-empty values corresponding to `sp_indices`. -// sp_shape: 1-D. Shape of the input SparseTensor. -// dense: `R`-D. The dense Tensor operand. -// -// Returns 1-D. The `N` values that are operated on. -func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseDenseCwiseMul", - Input: []tf.Input{ - sp_indices, sp_values, sp_shape, dense, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Splits a tensor into `num_split` tensors along one dimension. -// -// Arguments: -// axis: 0-D. The dimension along which to split. Must be in the range -// `[-rank(value), rank(value))`. 
-// value: The tensor to split. -// num_split: The number of ways to split. Must evenly divide -// `value.shape[split_dim]`. -// -// Returns They are identically shaped tensors, whose shape matches that of `value` -// except along `axis`, where their sizes are -// `values.shape[split_dim] / num_split`. -func Split(scope *Scope, axis tf.Output, value tf.Output, num_split int64) (output []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_split": num_split} - opspec := tf.OpSpec{ - Type: "Split", - Input: []tf.Input{ - axis, value, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if output, idx, err = makeOutputList(op, idx, "output"); err != nil { - scope.UpdateErr("Split", err) - return - } - return output -} - -// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput. -type Conv3DBackpropInputAttr func(optionalAttr) - -// Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 > -func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes the gradients of 3-D convolution with respect to the input. -// -// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2 -// -// Arguments: -// input: Shape `[batch, depth, rows, cols, in_channels]`. -// filter: Shape `[depth, rows, cols, in_channels, out_channels]`. -// `in_channels` must match between `input` and `filter`. -// out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols, -// out_channels]`. -// strides: 1-D tensor of length 5. The stride of the sliding window for each -// dimension of `input`. Must have `strides[0] = strides[4] = 1`. -// padding: The type of padding algorithm to use. -func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Conv3DBackpropInput", - Input: []tf.Input{ - input, filter, out_backprop, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a dataset containing elements of first component of `input_dataset` having true in the last component. -func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "FilterByLastComponentDataset", - Input: []tf.Input{ - input_dataset, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates an Optional variant with no value. -func OptionalNone(scope *Scope) (optional tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "OptionalNone", - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// FIFOQueueV2Attr is an optional argument to FIFOQueueV2. -type FIFOQueueV2Attr func(optionalAttr) - -// FIFOQueueV2Shapes sets the optional shapes attribute to value. -// -// value: The shape of each component in a value. 
The length of this attr must -// be either 0 or the same as the length of component_types. If the length of -// this attr is 0, the shapes of queue elements are not constrained, and -// only one element may be dequeued at a time. -// If not specified, defaults to <> -// -// REQUIRES: len(value) >= 0 -func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr { - return func(m optionalAttr) { - m["shapes"] = value - } -} - -// FIFOQueueV2Capacity sets the optional capacity attribute to value. -// -// value: The upper bound on the number of elements in this queue. -// Negative numbers mean no limit. -// If not specified, defaults to -1 -func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// FIFOQueueV2Container sets the optional container attribute to value. -// -// value: If non-empty, this queue is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func FIFOQueueV2Container(value string) FIFOQueueV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// FIFOQueueV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this queue will be shared under the given name -// across multiple sessions. -// If not specified, defaults to "" -func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// A queue that produces elements in first-in first-out order. -// -// Arguments: -// component_types: The type of each component in a value. -// -// Returns The handle to the queue. -func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"component_types": component_types} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "FIFOQueueV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Eagerly executes a python function to compute func(input)->output. The -// -// semantics of the input, output, and attributes are the same as those for -// PyFunc. -func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"token": token, "Tout": Tout} - opspec := tf.OpSpec{ - Type: "EagerPyFunc", - Input: []tf.Input{ - tf.OutputList(input), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if output, idx, err = makeOutputList(op, idx, "output"); err != nil { - scope.UpdateErr("EagerPyFunc", err) - return - } - return output -} - -// Deprecated. Use TensorArraySizeV3 -// -// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3 -func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorArraySizeV2", - Input: []tf.Input{ - handle, flow_in, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Reorders a SparseTensor into the canonical, row-major ordering. -// -// Note that by convention, all sparse ops preserve the canonical ordering along -// increasing dimension number. The only time ordering can be violated is during -// manual manipulation of the indices and values vectors to add entries. 
-// -// Reordering does not affect the shape of the SparseTensor. -// -// If the tensor has rank `R` and `N` non-empty values, `input_indices` has -// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`. -// -// Arguments: -// input_indices: 2-D. `N x R` matrix with the indices of non-empty values in a -// SparseTensor, possibly not in canonical ordering. -// input_values: 1-D. `N` non-empty values corresponding to `input_indices`. -// input_shape: 1-D. Shape of the input SparseTensor. -// -// Returns 2-D. `N x R` matrix with the same indices as input_indices, but -// in canonical row-major ordering.1-D. `N` non-empty values corresponding to `output_indices`. -func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseReorder", - Input: []tf.Input{ - input_indices, input_values, input_shape, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - // DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation. type DenseToDenseSetOperationAttr func(optionalAttr) @@ -14240,52 +18292,46 @@ func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_ return op.Output(0), op.Output(1), op.Output(2) } -// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam. -type ResourceApplyAdamAttr func(optionalAttr) +// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp. +type ResourceApplyRMSPropAttr func(optionalAttr) -// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value. +// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value. // -// value: If `True`, updating of the var, m, and v tensors will be protected +// value: If `True`, updating of the var, ms, and mom tensors is protected // by a lock; otherwise the behavior is undefined, but may exhibit less // contention. // If not specified, defaults to false -func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr { +func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr { return func(m optionalAttr) { m["use_locking"] = value } } -// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value. +// Update '*var' according to the RMSProp algorithm. // -// value: If `True`, uses the nesterov update. -// If not specified, defaults to false -func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr { - return func(m optionalAttr) { - m["use_nesterov"] = value - } -} - -// Update '*var' according to the Adam algorithm. +// Note that in dense implementation of this algorithm, ms and mom will +// update even if the grad is zero, but in this sparse implementation, ms +// and mom will not update in iterations during which the grad is zero. // -// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$ -// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ -// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ -// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ +// mean_square = decay * mean_square + (1-decay) * gradient ** 2 +// Delta = learning_rate * gradient / sqrt(mean_square + epsilon) +// +// ms <- rho * ms_{t-1} + (1-rho) * grad * grad +// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +// var <- var - mom // // Arguments: // var_: Should be from a Variable(). -// m: Should be from a Variable(). 
-// v: Should be from a Variable(). -// beta1_power: Must be a scalar. -// beta2_power: Must be a scalar. +// ms: Should be from a Variable(). +// mom: Should be from a Variable(). // lr: Scaling factor. Must be a scalar. -// beta1: Momentum factor. Must be a scalar. -// beta2: Momentum factor. Must be a scalar. +// rho: Decay rate. Must be a scalar. +// // epsilon: Ridge term. Must be a scalar. // grad: The gradient. // // Returns the created operation. -func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) { +func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) { if scope.Err() != nil { return } @@ -14294,24 +18340,264 @@ func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, b a(attrs) } opspec := tf.OpSpec{ - Type: "ResourceApplyAdam", + Type: "ResourceApplyRMSProp", Input: []tf.Input{ - var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad, + var_, ms, mom, lr, rho, momentum, epsilon, grad, }, Attrs: attrs, } return scope.AddOperation(opspec) } -// Computes exponential of x - 1 element-wise. +// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp. +type ResourceApplyCenteredRMSPropAttr func(optionalAttr) + +// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value. // -// I.e., \\(y = (\exp x) - 1\\). -func Expm1(scope *Scope, x tf.Output) (y tf.Output) { +// value: If `True`, updating of the var, mg, ms, and mom tensors is +// protected by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' according to the centered RMSProp algorithm. +// +// The centered RMSProp algorithm uses an estimate of the centered second moment +// (i.e., the variance) for normalization, as opposed to regular RMSProp, which +// uses the (uncentered) second moment. This often helps with training, but is +// slightly more expensive in terms of computation and memory. +// +// Note that in dense implementation of this algorithm, mg, ms, and mom will +// update even if the grad is zero, but in this sparse implementation, mg, ms, +// and mom will not update in iterations during which the grad is zero. +// +// mean_square = decay * mean_square + (1-decay) * gradient ** 2 +// mean_grad = decay * mean_grad + (1-decay) * gradient +// +// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) +// +// mg <- rho * mg_{t-1} + (1-rho) * grad +// ms <- rho * ms_{t-1} + (1-rho) * grad * grad +// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) +// var <- var - mom +// +// Arguments: +// var_: Should be from a Variable(). +// mg: Should be from a Variable(). +// ms: Should be from a Variable(). +// mom: Should be from a Variable(). +// lr: Scaling factor. Must be a scalar. +// rho: Decay rate. Must be a scalar. +// +// epsilon: Ridge term. Must be a scalar. +// grad: The gradient. +// +// Returns the created operation. 
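As a reference for the centered RMSProp equations above, here is a minimal scalar sketch in plain Go (hypothetical helper, not the resource op itself), applying one update step per call.

package main

import (
	"fmt"
	"math"
)

// centeredRMSPropStep applies one scalar update following the comment above:
//   mg  <- rho*mg  + (1-rho)*grad
//   ms  <- rho*ms  + (1-rho)*grad*grad
//   mom <- momentum*mom + lr*grad/sqrt(ms - mg*mg + epsilon)
//   var <- var - mom
func centeredRMSPropStep(v, mg, ms, mom, lr, rho, momentum, epsilon, grad float64) (float64, float64, float64, float64) {
	mg = rho*mg + (1-rho)*grad
	ms = rho*ms + (1-rho)*grad*grad
	mom = momentum*mom + lr*grad/math.Sqrt(ms-mg*mg+epsilon)
	v -= mom
	return v, mg, ms, mom
}

func main() {
	v, mg, ms, mom := 1.0, 0.0, 0.0, 0.0
	for i := 0; i < 3; i++ {
		v, mg, ms, mom = centeredRMSPropStep(v, mg, ms, mom, 0.01, 0.9, 0.9, 1e-10, 2.0)
	}
	fmt.Println(v, mg, ms, mom)
}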
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceApplyCenteredRMSProp", + Input: []tf.Input{ + var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// StatelessMultinomialAttr is an optional argument to StatelessMultinomial. +type StatelessMultinomialAttr func(optionalAttr) + +// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value. +// If not specified, defaults to DT_INT64 +func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr { + return func(m optionalAttr) { + m["output_dtype"] = value + } +} + +// Draws samples from a multinomial distribution. +// +// Arguments: +// logits: 2-D Tensor with shape `[batch_size, num_classes]`. Each slice `[i, :]` +// represents the unnormalized log probabilities for all classes. +// num_samples: 0-D. Number of independent samples to draw for each row slice. +// seed: 2 seeds (shape [2]). +// +// Returns 2-D Tensor with shape `[batch_size, num_samples]`. Each slice `[i, :]` +// contains the drawn class labels with range `[0, num_classes)`. +func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StatelessMultinomial", + Input: []tf.Input{ + logits, num_samples, seed, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// StatefulUniformFullIntAttr is an optional argument to StatefulUniformFullInt. +type StatefulUniformFullIntAttr func(optionalAttr) + +// StatefulUniformFullIntDtype sets the optional dtype attribute to value. +// +// value: The type of the output. +// If not specified, defaults to DT_UINT64 +func StatefulUniformFullIntDtype(value tf.DataType) StatefulUniformFullIntAttr { + return func(m optionalAttr) { + m["dtype"] = value + } +} + +// Outputs random integers from a uniform distribution. +// +// The generated values are uniform integers covering the whole range of `dtype`. +// +// Arguments: +// resource: The handle of the resource variable that stores the state of the RNG. +// algorithm: The RNG algorithm. +// shape: The shape of the output tensor. +// +// Returns Random values with specified shape. +func StatefulUniformFullInt(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulUniformFullIntAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StatefulUniformFullInt", + Input: []tf.Input{ + resource, algorithm, shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RestoreAttr is an optional argument to Restore. +type RestoreAttr func(optionalAttr) + +// RestorePreferredShard sets the optional preferred_shard attribute to value. +// +// value: Index of file to open first if multiple files match +// `file_pattern`. 
+// If not specified, defaults to -1 +func RestorePreferredShard(value int64) RestoreAttr { + return func(m optionalAttr) { + m["preferred_shard"] = value + } +} + +// Restores a tensor from checkpoint files. +// +// Reads a tensor stored in one or several files. If there are several files (for +// instance because a tensor was saved as slices), `file_pattern` may contain +// wildcard symbols (`*` and `?`) in the filename portion only, not in the +// directory portion. +// +// If a `file_pattern` matches several files, `preferred_shard` can be used to hint +// in which file the requested tensor is likely to be found. This op will first +// open the file at index `preferred_shard` in the list of matching files and try +// to restore tensors from that file. Only if some tensors or tensor slices are +// not found in that first file, then the Op opens all the files. Setting +// `preferred_shard` to match the value passed as the `shard` input +// of a matching `Save` Op may speed up Restore. This attribute only affects +// performance, not correctness. The default value -1 means files are processed in +// order. +// +// See also `RestoreSlice`. +// +// Arguments: +// file_pattern: Must have a single element. The pattern of the files from +// which we read the tensor. +// tensor_name: Must have a single element. The name of the tensor to be +// restored. +// dt: The type of the tensor to be restored. +// +// Returns The restored tensor. +func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dt": dt} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Restore", + Input: []tf.Input{ + file_pattern, tensor_name, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Gets the next output from the given iterator as an Optional variant. +func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (optional tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "IteratorGetNextAsOptional", + Input: []tf.Input{ + iterator, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the trignometric inverse sine of x element-wise. +// +// The `tf.math.asin` operation returns the inverse of `tf.math.sin`, such that +// if `y = tf.math.sin(x)` then, `x = tf.math.asin(y)`. +// +// **Note**: The output of `tf.math.asin` will lie within the invertible range +// of sine, i.e [-pi/2, pi/2]. +// +// For example: +// +// ```python +// # Note: [1.047, 0.785] ~= [(pi/3), (pi/4)] +// x = tf.constant([1.047, 0.785]) +// y = tf.math.sin(x) # [0.8659266, 0.7068252] +// +// tf.math.asin(y) # [1.047, 0.785] = x +// ``` +// +func Asin(scope *Scope, x tf.Output) (y tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "Expm1", + Type: "Asin", Input: []tf.Input{ x, }, @@ -14320,201 +18606,211 @@ func Expm1(scope *Scope, x tf.Output) (y tf.Output) { return op.Output(0) } -// Conv2DAttr is an optional argument to Conv2D. -type Conv2DAttr func(optionalAttr) - -// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value. 
-// If not specified, defaults to true -func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr { - return func(m optionalAttr) { - m["use_cudnn_on_gpu"] = value - } -} - -// Conv2DExplicitPaddings sets the optional explicit_paddings attribute to value. +// Returns the next record (key, value pair) produced by a Reader. // -// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith -// dimension, the amount of padding inserted before and after the dimension is -// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If -// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty. -// If not specified, defaults to <> -func Conv2DExplicitPaddings(value []int64) Conv2DAttr { - return func(m optionalAttr) { - m["explicit_paddings"] = value - } -} - -// Conv2DDataFormat sets the optional data_format attribute to value. -// -// value: Specify the data format of the input and output data. With the -// default format "NHWC", the data is stored in the order of: -// [batch, height, width, channels]. -// Alternatively, the format could be "NCHW", the data storage order of: -// [batch, channels, height, width]. -// If not specified, defaults to "NHWC" -func Conv2DDataFormat(value string) Conv2DAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Conv2DDilations sets the optional dilations attribute to value. -// -// value: 1-D tensor of length 4. The dilation factor for each dimension of -// `input`. If set to k > 1, there will be k-1 skipped cells between each -// filter element on that dimension. The dimension order is determined by the -// value of `data_format`, see above for details. Dilations in the batch and -// depth dimensions must be 1. -// If not specified, defaults to <i:1 i:1 i:1 i:1 > -func Conv2DDilations(value []int64) Conv2DAttr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes a 2-D convolution given 4-D `input` and `filter` tensors. -// -// Given an input tensor of shape `[batch, in_height, in_width, in_channels]` -// and a filter / kernel tensor of shape -// `[filter_height, filter_width, in_channels, out_channels]`, this op -// performs the following: -// -// 1. Flattens the filter to a 2-D matrix with shape -// `[filter_height * filter_width * in_channels, output_channels]`. -// 2. Extracts image patches from the input tensor to form a *virtual* -// tensor of shape `[batch, out_height, out_width, -// filter_height * filter_width * in_channels]`. -// 3. For each patch, right-multiplies the filter matrix and the image patch -// vector. -// -// In detail, with the default NHWC format, -// -// output[b, i, j, k] = -// sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] * -// filter[di, dj, q, k] -// -// Must have `strides[0] = strides[3] = 1`. For the most common case of the same -// horizontal and vertices strides, `strides = [1, stride, stride, 1]`. +// Will dequeue from the input queue if necessary (e.g. when the +// Reader needs to start reading from a new file since it has finished +// with the previous file). // // Arguments: -// input: A 4-D tensor. The dimension order is interpreted according to the value -// of `data_format`, see below for details. -// filter: A 4-D tensor of shape -// `[filter_height, filter_width, in_channels, out_channels]` -// strides: 1-D tensor of length 4. The stride of the sliding window for each -// dimension of `input`. The dimension order is determined by the value of -// `data_format`, see below for details. 
-// padding: The type of padding algorithm to use. +// reader_handle: Handle to a Reader. +// queue_handle: Handle to a Queue, with string work items. // -// Returns A 4-D tensor. The dimension order is determined by the value of -// `data_format`, see below for details. -func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) { +// Returns A scalar.A scalar. +func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"strides": strides, "padding": padding} + opspec := tf.OpSpec{ + Type: "ReaderReadV2", + Input: []tf.Input{ + reader_handle, queue_handle, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// Enqueue multiple Tensor values on the computation outfeed. +// +// Arguments: +// inputs: A list of tensors that will be inserted into the outfeed queue as an +// XLA tuple. +// +// Returns the created operation. +func OutfeedEnqueueTuple(scope *Scope, inputs []tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "OutfeedEnqueueTuple", + Input: []tf.Input{ + tf.OutputList(inputs), + }, + } + return scope.AddOperation(opspec) +} + +// Returns a batched matrix tensor with new batched diagonal values. +// +// Given `input` and `diagonal`, this operation returns a tensor with the +// same shape and values as `input`, except for the main diagonal of the +// innermost matrices. These will be overwritten by the values in `diagonal`. +// +// The output is computed as follows: +// +// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has +// `k` dimensions `[I, J, K, ..., min(M, N)]`. Then the output is a +// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where: +// +// * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`. +// * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`. +// +// Arguments: +// input: Rank `k+1`, where `k >= 1`. +// diagonal: Rank `k`, where `k >= 1`. +// +// Returns Rank `k+1`, with `output.shape = input.shape`. +func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "MatrixSetDiag", + Input: []tf.Input{ + input, diagonal, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Inverse 2D real-valued fast Fourier transform. +// +// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued +// signal over the inner-most 2 dimensions of `input`. +// +// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`: +// The inner-most dimension contains the `fft_length / 2 + 1` unique components of +// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed +// from the size of the inner-most 2 dimensions of `input`. If the FFT length used +// to compute `input` is odd, it should be provided since it cannot be inferred +// properly. +// +// Along each axis `IRFFT2D` is computed on, if `fft_length` (or +// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the +// corresponding dimension of `input`, the dimension is cropped. If it is larger, +// the dimension is padded with zeros. +// +// Arguments: +// input: A complex64 tensor. +// fft_length: An int32 tensor of shape [2]. The FFT length for each dimension. 
+// +// Returns A float32 tensor of the same rank as `input`. The inner-most 2 +// dimensions of `input` are replaced with the `fft_length` samples of their +// inverse 2D Fourier transform. +// +// @compatibility(numpy) +// Equivalent to np.fft.irfft2 +// @end_compatibility +func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "IRFFT2D", + Input: []tf.Input{ + input, fft_length, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad. +type ResourceApplyProximalAdagradAttr func(optionalAttr) + +// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value. +// +// value: If True, updating of the var and accum tensors will be protected by +// a lock; otherwise the behavior is undefined, but may exhibit less contention. +// If not specified, defaults to false +func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. +// +// accum += grad * grad +// prox_v = var - lr * grad * (1 / sqrt(accum)) +// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} +// +// Arguments: +// var_: Should be from a Variable(). +// accum: Should be from a Variable(). +// lr: Scaling factor. Must be a scalar. +// l1: L1 regularization. Must be a scalar. +// l2: L2 regularization. Must be a scalar. +// grad: The gradient. +// +// Returns the created operation. +func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "Conv2D", + Type: "ResourceApplyProximalAdagrad", Input: []tf.Input{ - input, filter, + var_, accum, lr, l1, l2, grad, }, Attrs: attrs, } - op := scope.AddOperation(opspec) - return op.Output(0) + return scope.AddOperation(opspec) } -// An op enabling differentiation of TPU Embeddings. -// -// This op simply returns its first input, which is assumed to have been sliced -// from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of -// this op, and its first argument being a trainable Variable, enables automatic -// differentiation of graphs containing embeddings via the TPU Embedding Python -// libraries. +// QuantizedAddAttr is an optional argument to QuantizedAdd. +type QuantizedAddAttr func(optionalAttr) + +// QuantizedAddToutput sets the optional Toutput attribute to value. +// If not specified, defaults to DT_QINT32 +func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr { + return func(m optionalAttr) { + m["Toutput"] = value + } +} + +// Returns x + y element-wise, working on quantized buffers. // // Arguments: -// embedding_variable: A trainable variable, enabling optimizers to find this op. -// sliced_activations: The embedding activations Tensor to return. -// table_id: The id of the table in the embedding layer configuration from which -// these activations were computed. -// lookup_id: Identifier of the set of embedding indices which produced these -// activations. 
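The FOBOS-with-Adagrad update quoted earlier for ResourceApplyProximalAdagrad can likewise be sketched for a single scalar weight. The helper below is hypothetical and purely illustrative of the accumulate/shrink steps, not the resource op.

package main

import (
	"fmt"
	"math"
)

// proximalAdagradStep applies, for one scalar weight, the update above:
//   accum += grad*grad
//   prox_v = var - lr*grad/sqrt(accum)
//   var    = sign(prox_v)/(1+lr*l2) * max(|prox_v| - lr*l1, 0)
func proximalAdagradStep(v, accum, lr, l1, l2, grad float64) (float64, float64) {
	accum += grad * grad
	proxV := v - lr*grad/math.Sqrt(accum)
	shrunk := math.Max(math.Abs(proxV)-lr*l1, 0)
	v = math.Copysign(shrunk, proxV) / (1 + lr*l2)
	return v, accum
}

func main() {
	v, accum := 1.0, 0.1
	v, accum = proximalAdagradStep(v, accum, 0.1, 0.01, 0.01, 0.5)
	fmt.Println(v, accum)
}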
-func TPUEmbeddingActivations(scope *Scope, embedding_variable tf.Output, sliced_activations tf.Output, table_id int64, lookup_id int64) (output tf.Output) { +// +// +// min_x: The float value that the lowest quantized `x` value represents. +// max_x: The float value that the highest quantized `x` value represents. +// min_y: The float value that the lowest quantized `y` value represents. +// max_y: The float value that the highest quantized `y` value represents. +// +// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. +// +// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about +// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"table_id": table_id, "lookup_id": lookup_id} - opspec := tf.OpSpec{ - Type: "TPUEmbeddingActivations", - Input: []tf.Input{ - embedding_variable, sliced_activations, - }, - Attrs: attrs, + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Deserialize `SparseTensor` objects. -// -// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where -// the last dimension stores serialized `SparseTensor` objects and the other N -// dimensions (N >= 0) correspond to a batch. The ranks of the original -// `SparseTensor` objects must all match. When the final `SparseTensor` is -// created, its rank is the rank of the incoming `SparseTensor` objects plus N; -// the sparse tensors have been concatenated along new dimensions, one for each -// batch. -// -// The output `SparseTensor` object's shape values for the original dimensions -// are the max across the input `SparseTensor` objects' shape values for the -// corresponding dimensions. The new dimensions match the size of the batch. -// -// The input `SparseTensor` objects' indices are assumed ordered in -// standard lexicographic order. If this is not the case, after this -// step run `SparseReorder` to restore index ordering. -// -// For example, if the serialized input is a `[2 x 3]` matrix representing two -// original `SparseTensor` objects: -// -// index = [ 0] -// [10] -// [20] -// values = [1, 2, 3] -// shape = [50] -// -// and -// -// index = [ 2] -// [10] -// values = [4, 5] -// shape = [30] -// -// then the final deserialized `SparseTensor` will be: -// -// index = [0 0] -// [0 10] -// [0 20] -// [1 2] -// [1 10] -// values = [1, 2, 3, 4, 5] -// shape = [2 50] -// -// Arguments: -// serialized_sparse: The serialized `SparseTensor` objects. The last dimension -// must have 3 columns. -// dtype: The `dtype` of the serialized `SparseTensor` objects. 
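The batching behaviour described in the DeserializeSparse example above can be mimicked in plain Go on already-deserialized components. The sketch below is hypothetical, assumes rank-1 inputs, and reproduces the worked example: a batch dimension is prepended to each index and the output shape is the element-wise max of the input shapes.

package main

import "fmt"

// batchSparse prepends a batch dimension to each tensor's indices and takes
// the element-wise max of the shapes, mirroring the worked example above.
// It assumes rank-1 inputs (each index is a single int); the real op works
// on serialized SparseTensor strings.
func batchSparse(indices, values, shapes [][]int64) (outIdx [][]int64, outVals, outShape []int64) {
	maxShape := make([]int64, len(shapes[0]))
	for _, s := range shapes {
		for d, v := range s {
			if v > maxShape[d] {
				maxShape[d] = v
			}
		}
	}
	for b, idx := range indices {
		for k, i := range idx {
			outIdx = append(outIdx, []int64{int64(b), i})
			outVals = append(outVals, values[b][k])
		}
	}
	outShape = append([]int64{int64(len(indices))}, maxShape...)
	return
}

func main() {
	idx, vals, shape := batchSparse(
		[][]int64{{0, 10, 20}, {2, 10}},
		[][]int64{{1, 2, 3}, {4, 5}},
		[][]int64{{50}, {30}},
	)
	fmt.Println(idx)   // [[0 0] [0 10] [0 20] [1 2] [1 10]]
	fmt.Println(vals)  // [1 2 3 4 5]
	fmt.Println(shape) // [2 50]
}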
-func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} opspec := tf.OpSpec{ - Type: "DeserializeSparse", + Type: "QuantizedAdd", Input: []tf.Input{ - serialized_sparse, + x, y, min_x, max_x, min_y, max_y, }, Attrs: attrs, } @@ -14522,36 +18818,309 @@ func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataT return op.Output(0), op.Output(1), op.Output(2) } -// RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr is an optional argument to RetrieveTPUEmbeddingStochasticGradientDescentParameters. -type RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr func(optionalAttr) +// Produces a summary of any statistics recorded by the given statistics manager. +func ExperimentalStatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ExperimentalStatsAggregatorSummary", + Input: []tf.Input{ + iterator, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} -// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId sets the optional table_id attribute to value. +// Computes softsign gradients for a softsign operation. +// +// Arguments: +// gradients: The backpropagated gradients to the corresponding softsign operation. +// features: The features passed as input to the corresponding softsign operation. +// +// Returns The gradients: `gradients / (1 + abs(features)) ** 2`. +func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SoftsignGrad", + Input: []tf.Input{ + gradients, features, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool. +type FractionalAvgPoolAttr func(optionalAttr) + +// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value. +// +// value: When set to True, generates the pooling sequence in a +// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin +// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for +// difference between pseudorandom and random. +// If not specified, defaults to false +func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr { + return func(m optionalAttr) { + m["pseudo_random"] = value + } +} + +// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value. +// +// value: When set to True, it means when pooling, the values at the boundary +// of adjacent pooling cells are used by both cells. For example: +// +// `index 0 1 2 3 4` +// +// `value 20 5 16 3 7` +// +// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice. +// The result would be [41/3, 26/3] for fractional avg pooling. +// If not specified, defaults to false +func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr { + return func(m optionalAttr) { + m["overlapping"] = value + } +} + +// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value. +// +// value: When set to True, a fixed pooling region will be used when +// iterating over a FractionalAvgPool node in the computation graph. Mainly used +// in unit test to make FractionalAvgPool deterministic. 
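To make the overlapping-pooling example in the FractionalAvgPoolOverlapping comment above concrete, here is a small plain-Go sketch over a single row (the real op pools over both rows and columns); the helper name is illustrative, and it reproduces the documented [41/3, 26/3] result for pooling sequence [0, 2, 4].

```go
package main

import "fmt"

// overlappingAvgPool1D averages values over the regions defined by a pooling
// sequence, with overlapping semantics: the boundary element of adjacent
// regions is shared by both regions.
func overlappingAvgPool1D(values []float64, seq []int) []float64 {
	out := make([]float64, 0, len(seq)-1)
	for i := 0; i+1 < len(seq); i++ {
		lo, hi := seq[i], seq[i+1] // hi is inclusive because overlapping=true
		sum := 0.0
		for j := lo; j <= hi; j++ {
			sum += values[j]
		}
		out = append(out, sum/float64(hi-lo+1))
	}
	return out
}

func main() {
	// The example from the comment above: values 20 5 16 3 7, sequence [0, 2, 4].
	// Prints [13.666..., 8.666...], i.e. [41/3, 26/3]; 16 at index 2 is used twice.
	fmt.Println(overlappingAvgPool1D([]float64{20, 5, 16, 3, 7}, []int{0, 2, 4}))
}
```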
+// If not specified, defaults to false +func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr { + return func(m optionalAttr) { + m["deterministic"] = value + } +} + +// FractionalAvgPoolSeed sets the optional seed attribute to value. +// +// value: If either seed or seed2 are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value. +// +// value: An second seed to avoid seed collision. +// If not specified, defaults to 0 +func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Performs fractional average pooling on the input. +// +// Fractional average pooling is similar to Fractional max pooling in the pooling +// region generation step. The only difference is that after pooling regions are +// generated, a mean operation is performed instead of a max operation in each +// pooling region. +// +// Arguments: +// value: 4-D with shape `[batch, height, width, channels]`. +// pooling_ratio: Pooling ratio for each dimension of `value`, currently only +// supports row and col dimension and should be >= 1.0. For example, a valid +// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements +// must be 1.0 because we don't allow pooling on batch and channels +// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions +// respectively. +// +// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient. +func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"pooling_ratio": pooling_ratio} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "FractionalAvgPool", + Input: []tf.Input{ + value, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2. +type PaddingFIFOQueueV2Attr func(optionalAttr) + +// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value. +// +// value: The shape of each component in a value. The length of this attr must +// be either 0 or the same as the length of component_types. +// Shapes of fixed rank but variable size are allowed by setting +// any shape dimension to -1. In this case, the inputs' shape may vary along +// the given dimension, and DequeueMany will pad the given dimension with +// zeros up to the maximum shape of all elements in the given batch. +// If the length of this attr is 0, different queue elements may have +// different ranks and shapes, but only one element may be dequeued at a time. +// If not specified, defaults to <> +// +// REQUIRES: len(value) >= 0 +func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr { + return func(m optionalAttr) { + m["shapes"] = value + } +} + +// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value. +// +// value: The upper bound on the number of elements in this queue. 
+// Negative numbers mean no limit. +// If not specified, defaults to -1 +func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// PaddingFIFOQueueV2Container sets the optional container attribute to value. +// +// value: If non-empty, this queue is placed in the given container. +// Otherwise, a default container is used. +// If not specified, defaults to "" +func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this queue will be shared under the given name +// across multiple sessions. +// If not specified, defaults to "" +func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// A queue that produces elements in first-in first-out order. +// +// Variable-size shapes are allowed by setting the corresponding shape dimensions +// to 0 in the shape attr. In this case DequeueMany will pad up to the maximum +// size of any given element in the minibatch. See below for details. +// +// Arguments: +// component_types: The type of each component in a value. +// +// Returns The handle to the queue. +func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"component_types": component_types} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "PaddingFIFOQueueV2", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// StatefulStandardNormalV2Attr is an optional argument to StatefulStandardNormalV2. +type StatefulStandardNormalV2Attr func(optionalAttr) + +// StatefulStandardNormalV2Dtype sets the optional dtype attribute to value. +// +// value: The type of the output. +// If not specified, defaults to DT_FLOAT +func StatefulStandardNormalV2Dtype(value tf.DataType) StatefulStandardNormalV2Attr { + return func(m optionalAttr) { + m["dtype"] = value + } +} + +// Outputs random values from a normal distribution. +// +// The generated values will have mean 0 and standard deviation 1. +// +// Arguments: +// resource: The handle of the resource variable that stores the state of the RNG. +// algorithm: The RNG algorithm. +// shape: The shape of the output tensor. +// +// Returns A tensor of the specified shape filled with random normal values. +func StatefulStandardNormalV2(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulStandardNormalV2Attr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StatefulStandardNormalV2", + Input: []tf.Input{ + resource, algorithm, shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// LoadTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingCenteredRMSPropParameters. +type LoadTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr) + +// LoadTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value. 
// If not specified, defaults to -1 // // REQUIRES: value >= -1 -func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr { +func LoadTPUEmbeddingCenteredRMSPropParametersTableId(value int64) LoadTPUEmbeddingCenteredRMSPropParametersAttr { return func(m optionalAttr) { m["table_id"] = value } } -// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName sets the optional table_name attribute to value. +// LoadTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value. // If not specified, defaults to "" -func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName(value string) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr { +func LoadTPUEmbeddingCenteredRMSPropParametersTableName(value string) LoadTPUEmbeddingCenteredRMSPropParametersAttr { return func(m optionalAttr) { m["table_name"] = value } } -// Retrieve SGD embedding parameters. +// Load centered RMSProp embedding parameters. // -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. +// An op that loads optimization parameters into HBM for embedding. Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. // -// Returns Parameter parameters updated by the stochastic gradient descent optimization algorithm. -func RetrieveTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr) (parameters tf.Output) { +// Arguments: +// parameters: Value of parameters used in the centered RMSProp optimization algorithm. +// ms: Value of ms used in the centered RMSProp optimization algorithm. +// mom: Value of mom used in the centered RMSProp optimization algorithm. +// mg: Value of mg used in the centered RMSProp optimization algorithm. +// +// +// +// Returns the created operation. +func LoadTPUEmbeddingCenteredRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingCenteredRMSPropParametersAttr) (o *tf.Operation) { if scope.Err() != nil { return } @@ -14560,297 +19129,213 @@ func RetrieveTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, num_s a(attrs) } opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingStochasticGradientDescentParameters", + Type: "LoadTPUEmbeddingCenteredRMSPropParameters", + Input: []tf.Input{ + parameters, ms, mom, mg, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} +// DequantizeAttr is an optional argument to Dequantize. +type DequantizeAttr func(optionalAttr) + +// DequantizeMode sets the optional mode attribute to value. +// If not specified, defaults to "MIN_COMBINED" +func DequantizeMode(value string) DequantizeAttr { + return func(m optionalAttr) { + m["mode"] = value + } +} + +// Dequantize the 'input' tensor into a float Tensor. +// +// [min_range, max_range] are scalar floats that specify the range for +// the 'input' data. 
The 'mode' attribute controls exactly which calculations are +// used to convert the float values to their quantized equivalents. +// +// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following: +// +// ``` +// if T == qint8: in[i] += (range(T) + 1)/ 2.0 +// out[i] = min_range + (in[i]* (max_range - min_range) / range(T)) +// ``` +// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()` +// +// *MIN_COMBINED Mode Example* +// +// If the input comes from a QuantizedRelu6, the output type is +// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is +// 0-6. The min_range and max_range values are therefore 0.0 and 6.0. +// Dequantize on quint8 will take each value, cast to float, and multiply +// by 6 / 255. +// Note that if quantizedtype is qint8, the operation will additionally add +// each value by 128 prior to casting. +// +// If the mode is 'MIN_FIRST', then this approach is used: +// +// ```c++ +// num_discrete_values = 1 << (# of bits in T) +// range_adjust = num_discrete_values / (num_discrete_values - 1) +// range = (range_max - range_min) * range_adjust +// range_scale = range / num_discrete_values +// const double offset_input = static_cast<double>(input) - lowest_quantized; +// result = range_min + ((input - numeric_limits<T>::min()) * range_scale) +// ``` +// +// *SCALED mode Example* +// +// `SCALED` mode matches the quantization approach used in +// `QuantizeAndDequantize{V2|V3}`. +// +// If the mode is `SCALED`, we do not use the full range of the output type, +// choosing to elide the lowest possible value for symmetry (e.g., output range is +// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to +// 0. +// +// We first find the range of values in our tensor. The +// range we use is always centered on 0, so we find m such that +// ```c++ +// m = max(abs(input_min), abs(input_max)) +// ``` +// +// Our input tensor range is then `[-m, m]`. +// +// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`. +// If T is signed, this is +// ``` +// num_bits = sizeof(T) * 8 +// [min_fixed, max_fixed] = +// [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1] +// ``` +// +// Otherwise, if T is unsigned, the fixed-point range is +// ``` +// [min_fixed, max_fixed] = [0, (1 << num_bits) - 1] +// ``` +// +// From this we compute our scaling factor, s: +// ```c++ +// s = (2 * m) / (max_fixed - min_fixed) +// ``` +// +// Now we can dequantize the elements of our tensor: +// ```c++ +// result = input * s +// ``` +// +// Arguments: +// +// min_range: The minimum scalar value possibly produced for the input. +// max_range: The maximum scalar value possibly produced for the input. +func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Dequantize", + Input: []tf.Input{ + input, min_range, max_range, + }, Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) } -// QuantizedConv2DPerChannelAttr is an optional argument to QuantizedConv2DPerChannel. -type QuantizedConv2DPerChannelAttr func(optionalAttr) +// RetrieveTPUEmbeddingAdadeltaParametersAttr is an optional argument to RetrieveTPUEmbeddingAdadeltaParameters. 
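A minimal plain-Go sketch of the SCALED-mode arithmetic documented for Dequantize above, assuming a signed 8-bit input so the fixed-point range is the symmetric [-127, 127] the comment describes; the function name and types are illustrative only.

```go
package main

import (
	"fmt"
	"math"
)

// dequantizeScaledInt8 mirrors the SCALED-mode math from the Dequantize doc:
// m = max(|input_min|, |input_max|), s = 2m / (max_fixed - min_fixed), and
// each quantized value maps to value * s, so [-127, 127] covers [-m, m].
func dequantizeScaledInt8(q []int8, inputMin, inputMax float64) []float64 {
	m := math.Max(math.Abs(inputMin), math.Abs(inputMax))
	const minFixed, maxFixed = -127.0, 127.0
	s := (2 * m) / (maxFixed - minFixed) // = m / 127
	out := make([]float64, len(q))
	for i, v := range q {
		out[i] = float64(v) * s
	}
	return out
}

func main() {
	// With a tensor range of [-6, 6], the quantized value 127 dequantizes to 6.0.
	fmt.Println(dequantizeScaledInt8([]int8{-127, 0, 127}, -6, 6))
}
```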
+type RetrieveTPUEmbeddingAdadeltaParametersAttr func(optionalAttr) -// QuantizedConv2DPerChannelOutType sets the optional out_type attribute to value. +// RetrieveTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 // -// value: The quantized type of output tensor that needs to be converted. -// If not specified, defaults to DT_QINT32 -func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChannelAttr { +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingAdadeltaParametersTableId(value int64) RetrieveTPUEmbeddingAdadeltaParametersAttr { return func(m optionalAttr) { - m["out_type"] = value + m["table_id"] = value } } -// QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. -// -// value: list of dilation values. -// If not specified, defaults to <i:1 i:1 i:1 i:1 > -func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { +// RetrieveTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func RetrieveTPUEmbeddingAdadeltaParametersTableName(value string) RetrieveTPUEmbeddingAdadeltaParametersAttr { return func(m optionalAttr) { - m["dilations"] = value + m["table_name"] = value } } -// Computes QuantizedConv2D per channel. +// Retrieve Adadelta embedding parameters. // -// Arguments: -// input: The original input tensor. -// filter: The original filter tensor. -// min_input: The minimum value of the input tensor -// max_input: The maximum value of the input tensor. -// min_filter: The minimum value of the filter tensor. -// max_filter: The maximum value of the filter tensor. -// strides: list of stride values. +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. // -// -// Returns The output tensor.The minimum value of the final output tensor.The maximum value of the final output tensor. -func QuantizedConv2DPerChannel(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DPerChannelAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) { +// Returns Parameter parameters updated by the Adadelta optimization algorithm.Parameter accumulators updated by the Adadelta optimization algorithm.Parameter updates updated by the Adadelta optimization algorithm. +func RetrieveTPUEmbeddingAdadeltaParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdadeltaParametersAttr) (parameters tf.Output, accumulators tf.Output, updates tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"strides": strides, "padding": padding} + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "QuantizedConv2DPerChannel", - Input: []tf.Input{ - input, filter, min_input, max_input, min_filter, max_filter, - }, + Type: "RetrieveTPUEmbeddingAdadeltaParameters", + Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0), op.Output(1), op.Output(2) } -// DecodePaddedRawAttr is an optional argument to DecodePaddedRaw. 
-type DecodePaddedRawAttr func(optionalAttr) - -// DecodePaddedRawLittleEndian sets the optional little_endian attribute to value. -// -// value: Whether the input `input_bytes` is in little-endian order. Ignored for -// `out_type` values that are stored in a single byte, like `uint8` -// If not specified, defaults to true -func DecodePaddedRawLittleEndian(value bool) DecodePaddedRawAttr { - return func(m optionalAttr) { - m["little_endian"] = value - } -} - -// Reinterpret the bytes of a string as a vector of numbers. +// Transforms a tf.Example proto (as a string) into typed tensors. // // Arguments: -// input_bytes: Tensor of string to be decoded. -// fixed_length: Length in bytes for each element of the decoded output. Must be a multiple -// of the size of the output type. -// -// -// Returns A Tensor with one more dimension than the input `bytes`. The added dimension -// will have size equal to the length of the elements of `bytes` divided by the -// number of bytes to represent `out_type`. -func DecodePaddedRaw(scope *Scope, input_bytes tf.Output, fixed_length tf.Output, out_type tf.DataType, optional ...DecodePaddedRawAttr) (output tf.Output) { +// serialized: A vector containing a batch of binary serialized Example protos. +// dense_defaults: A list of Tensors (some may be empty), whose length matches +// the length of `dense_keys`. dense_defaults[j] provides default values +// when the example's feature_map lacks dense_key[j]. If an empty Tensor is +// provided for dense_defaults[j], then the Feature dense_keys[j] is required. +// The input type is inferred from dense_defaults[j], even when it's empty. +// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined, +// then the shape of dense_defaults[j] must match that of dense_shapes[j]. +// If dense_shapes[j] has an undefined major dimension (variable strides dense +// feature), dense_defaults[j] must contain a single element: +// the padding element. +// num_sparse: The number of sparse features to be parsed from the example. This +// must match the lengths of `sparse_keys` and `sparse_types`. +// sparse_keys: A list of `num_sparse` strings. +// The keys expected in the Examples' features associated with sparse values. +// dense_keys: The keys expected in the Examples' features associated with dense +// values. +// sparse_types: A list of `num_sparse` types; the data types of data in each +// Feature given in sparse_keys. +// Currently the ParseSingleExample op supports DT_FLOAT (FloatList), +// DT_INT64 (Int64List), and DT_STRING (BytesList). +// dense_shapes: The shapes of data in each Feature given in dense_keys. +// The length of this list must match the length of `dense_keys`. The +// number of elements in the Feature corresponding to dense_key[j] must +// always equal dense_shapes[j].NumEntries(). If dense_shapes[j] == +// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j] +// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1, +// ..., DN), the shape of the output Tensor dense_values[j] will be (M, +// D1, .., DN), where M is the number of blocks of elements of length +// D1 * .... * DN, in the input. 
+func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"out_type": out_type} - for _, a := range optional { - a(attrs) - } + attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes} opspec := tf.OpSpec{ - Type: "DecodePaddedRaw", + Type: "ParseSingleExample", Input: []tf.Input{ - input_bytes, fixed_length, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// The gradient operator for the SparseAdd op. -// -// The SparseAdd op calculates A + B, where A, B, and the sum are all represented -// as `SparseTensor` objects. This op takes in the upstream gradient w.r.t. -// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty -// values of A and B. -// -// Arguments: -// backprop_val_grad: 1-D with shape `[nnz(sum)]`. The gradient with respect to -// the non-empty values of the sum. -// a_indices: 2-D. The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`. -// b_indices: 2-D. The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`. -// sum_indices: 2-D. The `indices` of the sum `SparseTensor`, size -// `[nnz(sum), ndims]`. -// -// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the -// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the -// non-empty values of B. -func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseAddGrad", - Input: []tf.Input{ - backprop_val_grad, a_indices, b_indices, sum_indices, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// Clips tensor values to a specified min and max. -// -// Given a tensor `t`, this operation returns a tensor of the same type and -// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`. -// Any values less than `clip_value_min` are set to `clip_value_min`. Any values -// greater than `clip_value_max` are set to `clip_value_max`. -// -// Arguments: -// t: A `Tensor`. -// clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape -// as `t`. The minimum value to clip by. -// clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape -// as `t`. The maximum value to clip by. -// -// Returns A clipped `Tensor` with the same shape as input 't'. -func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ClipByValue", - Input: []tf.Input{ - t, clip_value_min, clip_value_max, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Outputs a `Summary` protocol buffer with a tensor and per-plugin data. -// -// Arguments: -// tag: A string attached to this summary. Used for organization in TensorBoard. -// tensor: A tensor to serialize. -// serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin -// data. 
-func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorSummaryV2", - Input: []tf.Input{ - tag, tensor, serialized_summary_metadata, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Reads the value of a variable. -// -// The tensor returned by this operation is immutable. -// -// The value returned by this operation is guaranteed to be influenced by all the -// writes on which this operation depends directly or indirectly, and to not be -// influenced by any of the writes which depend directly or indirectly on this -// operation. -// -// Arguments: -// resource: handle to the resource in which to store the variable. -// dtype: the dtype of the value. -func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - opspec := tf.OpSpec{ - Type: "ReadVariableOp", - Input: []tf.Input{ - resource, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// SdcaOptimizerV2Attr is an optional argument to SdcaOptimizerV2. -type SdcaOptimizerV2Attr func(optionalAttr) - -// SdcaOptimizerV2Adaptive sets the optional adaptive attribute to value. -// -// value: Whether to use Adaptive SDCA for the inner loop. -// If not specified, defaults to true -func SdcaOptimizerV2Adaptive(value bool) SdcaOptimizerV2Attr { - return func(m optionalAttr) { - m["adaptive"] = value - } -} - -// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for -// -// linear models with L1 + L2 regularization. As global optimization objective is -// strongly-convex, the optimizer optimizes the dual objective at each step. The -// optimizer applies each update one example at a time. Examples are sampled -// uniformly, and the optimizer is learning rate free and enjoys linear convergence -// rate. -// -// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br> -// Shai Shalev-Shwartz, Tong Zhang. 2012 -// -// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$ -// -// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br> -// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan, -// Peter Richtarik, Martin Takac. 2015 -// -// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br> -// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015 -// -// Arguments: -// sparse_example_indices: a list of vectors which contain example indices. -// sparse_feature_indices: a list of vectors which contain feature indices. -// sparse_feature_values: a list of vectors which contains feature value -// associated with each feature group. -// dense_features: a list of matrices which contains the dense feature values. -// example_weights: a vector which contains the weight associated with each -// example. -// example_labels: a vector which contains the label/target associated with each -// example. -// sparse_indices: a list of vectors where each value is the indices which has -// corresponding weights in sparse_weights. This field maybe omitted for the -// dense approach. -// sparse_weights: a list of vectors where each value is the weight associated with -// a sparse feature group. 
-// dense_weights: a list of vectors where the values are the weights associated -// with a dense feature group. -// example_state_data: a list of vectors containing the example state data. -// loss_type: Type of the primal loss. Currently SdcaSolver supports logistic, -// squared and hinge losses. -// l1: Symmetric l1 regularization strength. -// l2: Symmetric l2 regularization strength. -// num_loss_partitions: Number of partitions of the global loss function. -// num_inner_iterations: Number of iterations per mini-batch. -// -// Returns a list of vectors containing the updated example state -// data.a list of vectors where each value is the delta -// weights associated with a sparse feature group.a list of vectors where the values are the delta -// weights associated with a dense feature group. -func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerV2Attr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SdcaOptimizerV2", - Input: []tf.Input{ - tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data, + serialized, tf.OutputList(dense_defaults), }, Attrs: attrs, } @@ -14860,85 +19345,65 @@ func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_fe } var idx int var err error - out_example_state_data = op.Output(idx) - if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil { - scope.UpdateErr("SdcaOptimizerV2", err) + if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil { + scope.UpdateErr("ParseSingleExample", err) return } - if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil { - scope.UpdateErr("SdcaOptimizerV2", err) + if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil { + scope.UpdateErr("ParseSingleExample", err) return } - return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights + if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil { + scope.UpdateErr("ParseSingleExample", err) + return + } + if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil { + scope.UpdateErr("ParseSingleExample", err) + return + } + return sparse_indices, sparse_values, sparse_shapes, dense_values } -// Applies softmax to a batched N-D `SparseTensor`. -// -// The inputs represent an N-D SparseTensor with logical shape `[..., B, C]` -// (where `N >= 2`), and with indices sorted in the canonical lexicographic order. 
-// -// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost -// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly -// zero elements do not participate*. Specifically, the algorithm is equivalent -// to the following: -// -// (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix -// with shape `[B, C]`, along the size-C dimension; -// (2) Masks out the original implicitly-zero locations; -// (3) Renormalizes the remaining elements. -// -// Hence, the `SparseTensor` result has exactly the same non-zero indices and -// shape. -// -// Arguments: -// sp_indices: 2-D. `NNZ x R` matrix with the indices of non-empty values in a -// SparseTensor, in canonical ordering. -// sp_values: 1-D. `NNZ` non-empty values corresponding to `sp_indices`. -// sp_shape: 1-D. Shape of the input SparseTensor. -// -// Returns 1-D. The `NNZ` values for the result `SparseTensor`. -func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseSoftmax", - Input: []tf.Input{ - sp_indices, sp_values, sp_shape, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} +// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2. +type ResourceSparseApplyFtrlV2Attr func(optionalAttr) -// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA. -type ResourceApplyAdagradDAAttr func(optionalAttr) - -// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value. +// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value. // -// value: If True, updating of the var and accum tensors will be protected by -// a lock; otherwise the behavior is undefined, but may exhibit less contention. +// value: If `True`, updating of the var and accum tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. // If not specified, defaults to false -func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr { +func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr { return func(m optionalAttr) { m["use_locking"] = value } } -// Update '*var' according to the proximal adagrad scheme. +// Update relevant entries in '*var' according to the Ftrl-proximal scheme. +// +// That is for rows we have grad for, we update var, accum and linear as follows: +// grad_with_shrinkage = grad + 2 * l2_shrinkage * var +// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage +// linear += grad_with_shrinkage + +// (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +// accum = accum_new // // Arguments: // var_: Should be from a Variable(). -// gradient_accumulator: Should be from a Variable(). -// gradient_squared_accumulator: Should be from a Variable(). +// accum: Should be from a Variable(). +// linear: Should be from a Variable(). // grad: The gradient. +// indices: A vector of indices into the first dimension of var and accum. // lr: Scaling factor. Must be a scalar. // l1: L1 regularization. Must be a scalar. -// l2: L2 regularization. Must be a scalar. -// global_step: Training step number. Must be a scalar. +// l2: L2 shrinkage regulariation. Must be a scalar. +// +// lr_power: Scaling factor. 
Must be a scalar. // // Returns the created operation. -func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) { +func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) { if scope.Err() != nil { return } @@ -14947,179 +19412,231 @@ func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator t a(attrs) } opspec := tf.OpSpec{ - Type: "ResourceApplyAdagradDA", + Type: "ResourceSparseApplyFtrlV2", Input: []tf.Input{ - var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step, + var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power, }, Attrs: attrs, } return scope.AddOperation(opspec) } -// Computes the power of one value to another. +// LoadTPUEmbeddingMDLAdagradLightParametersAttr is an optional argument to LoadTPUEmbeddingMDLAdagradLightParameters. +type LoadTPUEmbeddingMDLAdagradLightParametersAttr func(optionalAttr) + +// LoadTPUEmbeddingMDLAdagradLightParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 // -// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for -// corresponding elements in `x` and `y`. For example: +// REQUIRES: value >= -1 +func LoadTPUEmbeddingMDLAdagradLightParametersTableId(value int64) LoadTPUEmbeddingMDLAdagradLightParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// LoadTPUEmbeddingMDLAdagradLightParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func LoadTPUEmbeddingMDLAdagradLightParametersTableName(value string) LoadTPUEmbeddingMDLAdagradLightParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Load MDL Adagrad Light embedding parameters. // -// ``` -// # tensor 'x' is [[2, 2]], [3, 3]] -// # tensor 'y' is [[8, 16], [2, 3]] -// tf.pow(x, y) ==> [[256, 65536], [9, 27]] -// ``` -func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { +// An op that loads optimization parameters into HBM for embedding. Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. +// +// Arguments: +// parameters: Value of parameters used in the MDL Adagrad Light optimization algorithm. +// accumulators: Value of accumulators used in the MDL Adagrad Light optimization algorithm. +// weights: Value of weights used in the MDL Adagrad Light optimization algorithm. +// benefits: Value of benefits used in the MDL Adagrad Light optimization algorithm. +// +// +// +// Returns the created operation. 
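As a hedged reference for the Ftrl-proximal update with L2 shrinkage spelled out in the ResourceSparseApplyFtrlV2 comment above, this plain-Go sketch applies the documented formulas to a single element; a sparse apply would run this only for the rows named by `indices`, and the scalar form and function name are illustrative only.

```go
package main

import (
	"fmt"
	"math"
)

// ftrlV2Step returns the updated (var, accum, linear) triple for one element,
// following the documented update: shrinkage-adjusted gradient, accumulator
// update, linear-term update, then the proximal solve against l1/l2.
func ftrlV2Step(v, accum, linear, grad, lr, l1, l2, l2Shrinkage, lrPower float64) (float64, float64, float64) {
	gShrunk := grad + 2*l2Shrinkage*v
	accumNew := accum + gShrunk*gShrunk
	linear += gShrunk + (math.Pow(accumNew, -lrPower)-math.Pow(accum, -lrPower))/lr*v
	quadratic := 1.0/(math.Pow(accumNew, lrPower)*lr) + 2*l2
	if math.Abs(linear) > l1 {
		sign := 1.0
		if linear < 0 {
			sign = -1.0
		}
		v = (sign*l1 - linear) / quadratic
	} else {
		v = 0.0
	}
	return v, accumNew, linear
}

func main() {
	v, accum, linear := ftrlV2Step(0.5, 0.1, 0.0, 0.2, 0.05, 0.001, 0.001, 0.01, -0.5)
	fmt.Println(v, accum, linear)
}
```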
+func LoadTPUEmbeddingMDLAdagradLightParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, weights tf.Output, benefits tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMDLAdagradLightParametersAttr) (o *tf.Operation) { if scope.Err() != nil { return } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } opspec := tf.OpSpec{ - Type: "Pow", + Type: "LoadTPUEmbeddingMDLAdagradLightParameters", Input: []tf.Input{ - x, y, + parameters, accumulators, weights, benefits, }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// SpaceToDepthAttr is an optional argument to SpaceToDepth. +type SpaceToDepthAttr func(optionalAttr) + +// SpaceToDepthDataFormat sets the optional data_format attribute to value. +// If not specified, defaults to "NHWC" +func SpaceToDepthDataFormat(value string) SpaceToDepthAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// SpaceToDepth for tensors of type T. +// +// Rearranges blocks of spatial data, into depth. More specifically, +// this op outputs a copy of the input tensor where values from the `height` +// and `width` dimensions are moved to the `depth` dimension. +// The attr `block_size` indicates the input block size. +// +// * Non-overlapping blocks of size `block_size x block size` are rearranged +// into depth at each location. +// * The depth of the output tensor is `block_size * block_size * input_depth`. +// * The Y, X coordinates within each block of the input become the high order +// component of the output channel index. +// * The input tensor's height and width must be divisible by block_size. +// +// The `data_format` attr specifies the layout of the input and output tensors +// with the following options: +// "NHWC": `[ batch, height, width, channels ]` +// "NCHW": `[ batch, channels, height, width ]` +// "NCHW_VECT_C": +// `qint8 [ batch, channels / 4, height, width, 4 ]` +// +// It is useful to consider the operation as transforming a 6-D Tensor. +// e.g. for data_format = NHWC, +// Each element in the input tensor can be specified via 6 coordinates, +// ordered by decreasing memory layout significance as: +// n,oY,bY,oX,bX,iC (where n=batch index, oX, oY means X or Y coordinates +// within the output image, bX, bY means coordinates +// within the input block, iC means input channels). +// The output would be a transpose to the following layout: +// n,oY,oX,bY,bX,iC +// +// This operation is useful for resizing the activations between convolutions +// (but keeping all data), e.g. instead of pooling. It is also useful for training +// purely convolutional models. +// +// For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and +// block_size = 2: +// +// ``` +// x = [[[[1], [2]], +// [[3], [4]]]] +// ``` +// +// This operation will output a tensor of shape `[1, 1, 1, 4]`: +// +// ``` +// [[[[1, 2, 3, 4]]]] +// ``` +// +// Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`, +// the corresponding output will have a single element (i.e. width and height are +// both 1) and will have a depth of 4 channels (1 * block_size * block_size). +// The output element shape is `[1, 1, 4]`. +// +// For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g. 
+// +// ``` +// x = [[[[1, 2, 3], [4, 5, 6]], +// [[7, 8, 9], [10, 11, 12]]]] +// ``` +// +// This operation, for block_size of 2, will return the following tensor of shape +// `[1, 1, 1, 12]` +// +// ``` +// [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]] +// ``` +// +// Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2: +// +// ``` +// x = [[[[1], [2], [5], [6]], +// [[3], [4], [7], [8]], +// [[9], [10], [13], [14]], +// [[11], [12], [15], [16]]]] +// ``` +// +// the operator will return the following tensor of shape `[1 2 2 4]`: +// +// ``` +// x = [[[[1, 2, 3, 4], +// [5, 6, 7, 8]], +// [[9, 10, 11, 12], +// [13, 14, 15, 16]]]] +// ``` +// +// Arguments: +// +// block_size: The size of the spatial block. +func SpaceToDepth(scope *Scope, input tf.Output, block_size int64, optional ...SpaceToDepthAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"block_size": block_size} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SpaceToDepth", + Input: []tf.Input{ + input, + }, + Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) } -// Transforms a Tensor into a serialized TensorProto proto. +// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal. +type ParameterizedTruncatedNormalAttr func(optionalAttr) + +// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value. // -// Arguments: -// tensor: A Tensor of type `T`. -// -// Returns A serialized TensorProto proto of the input tensor. -func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SerializeTensor", - Input: []tf.Input{ - tensor, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns up to `num_records` (key, value) pairs produced by a Reader. -// -// Will dequeue from the input queue if necessary (e.g. when the -// Reader needs to start reading from a new file since it has finished -// with the previous file). -// It may return less than `num_records` even before the last batch. -// -// Arguments: -// reader_handle: Handle to a `Reader`. -// queue_handle: Handle to a `Queue`, with string work items. -// num_records: number of records to read from `Reader`. -// -// Returns A 1-D tensor.A 1-D tensor. -func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ReaderReadUpToV2", - Input: []tf.Input{ - reader_handle, queue_handle, num_records, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop. -type CudnnRNNBackpropAttr func(optionalAttr) - -// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value. -// If not specified, defaults to "lstm" -func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr { - return func(m optionalAttr) { - m["rnn_mode"] = value - } -} - -// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value. -// If not specified, defaults to "linear_input" -func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr { - return func(m optionalAttr) { - m["input_mode"] = value - } -} - -// CudnnRNNBackpropDirection sets the optional direction attribute to value. 
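To connect the SpaceToDepth layout description above to concrete index math, here is a plain-Go reference sketch for NHWC tensors stored as flat slices; it reproduces the documented `[1, 2, 2, 1]`, block_size 2 example, and the helper name is illustrative rather than the op's implementation.

```go
package main

import "fmt"

// spaceToDepthNHWC moves block_size x block_size spatial blocks into depth:
// the within-block coordinates (bY, bX) become the high-order part of the
// output channel index, as described in the SpaceToDepth comment.
func spaceToDepthNHWC(in []float32, n, h, w, c, block int) []float32 {
	oh, ow, oc := h/block, w/block, c*block*block
	out := make([]float32, n*oh*ow*oc)
	for b := 0; b < n; b++ {
		for oy := 0; oy < oh; oy++ {
			for ox := 0; ox < ow; ox++ {
				for by := 0; by < block; by++ {
					for bx := 0; bx < block; bx++ {
						for ic := 0; ic < c; ic++ {
							src := ((b*h+oy*block+by)*w+ox*block+bx)*c + ic
							dst := ((b*oh+oy)*ow+ox)*oc + (by*block+bx)*c + ic
							out[dst] = in[src]
						}
					}
				}
			}
		}
	}
	return out
}

func main() {
	// The documented example: shape [1, 2, 2, 1], block_size 2 -> [[[[1 2 3 4]]]].
	fmt.Println(spaceToDepthNHWC([]float32{1, 2, 3, 4}, 1, 2, 2, 1, 2))
}
```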
-// If not specified, defaults to "unidirectional" -func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr { - return func(m optionalAttr) { - m["direction"] = value - } -} - -// CudnnRNNBackpropDropout sets the optional dropout attribute to value. +// value: If either `seed` or `seed2` are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. // If not specified, defaults to 0 -func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr { - return func(m optionalAttr) { - m["dropout"] = value - } -} - -// CudnnRNNBackpropSeed sets the optional seed attribute to value. -// If not specified, defaults to 0 -func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr { +func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr { return func(m optionalAttr) { m["seed"] = value } } -// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value. +// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value. +// +// value: A second seed to avoid seed collision. // If not specified, defaults to 0 -func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr { +func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr { return func(m optionalAttr) { m["seed2"] = value } } -// Backprop step of CudnnRNN. +// Outputs random values from a normal distribution. The parameters may each be a // -// Compute the backprop of both data and weights in a RNN. +// scalar which applies to the entire output, or a vector of length shape[0] which +// stores the parameters for each batch. // -// rnn_mode: Indicates the type of the RNN model. -// input_mode: Indicate whether there is a linear projection between the input and -// the actual computation before the first layer. 'skip_input' is only allowed -// when input_size == num_units; 'auto_select' implies 'skip_input' when -// input_size == num_units; otherwise, it implies 'linear_input'. -// direction: Indicates whether a bidirectional model will be used. Should be -// "unidirectional" or "bidirectional". -// dropout: Dropout probability. When set to 0., dropout is disabled. -// seed: The 1st part of a seed to initialize dropout. -// seed2: The 2nd part of a seed to initialize dropout. -// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size]. -// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size, -// num_units]. -// input_c: For LSTM, a 3-D tensor with the shape of -// [num_layer * dir, batch, num_units]. For other models, it is ignored. -// params: A 1-D tensor that contains the weights and biases in an opaque layout. -// The size must be created through CudnnRNNParamsSize, and initialized -// separately. Note that they might not be compatible across different -// generations. So it is a good idea to save and restore -// output: A 3-D tensor with the shape of [seq_length, batch_size, -// dir * num_units]. -// output_h: The same shape has input_h. -// output_c: The same shape as input_c for LSTM. An empty tensor for other models. -// output_backprop: A 3-D tensor with the same shape as output in the forward pass. -// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward -// pass. -// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward -// pass. -// reserve_space: The same reserve_space produced in for forward operation. -// input_backprop: The backprop to input in the forward pass. 
Has the same shape -// as input. -// input_h_backprop: The backprop to input_h in the forward pass. Has the same -// shape as input_h. -// input_c_backprop: The backprop to input_c in the forward pass. Has the same -// shape as input_c. -// params_backprop: The backprop to the params buffer in the forward pass. Has the -// same shape as params. -func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) { +// Arguments: +// shape: The shape of the output tensor. Batches are indexed by the 0th dimension. +// means: The mean parameter of each batch. +// stdevs: The standard deviation parameter of each batch. Must be greater than 0. +// minvals: The minimum cutoff. May be -infinity. +// maxvals: The maximum cutoff. May be +infinity, and must be more than the minval +// for each batch. +// +// Returns A matrix of shape num_batches x samples_per_batch, filled with random +// truncated normal values using the parameters for each row. +func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -15128,59 +19645,9 @@ func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c a(attrs) } opspec := tf.OpSpec{ - Type: "CudnnRNNBackprop", + Type: "ParameterizedTruncatedNormal", Input: []tf.Input{ - input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3) -} - -// Table initializer that takes two tensors for keys and values respectively. -// -// Arguments: -// table_handle: Handle to a table which will be initialized. -// keys: Keys of type Tkey. -// values: Values of type Tval. -// -// Returns the created operation. -func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "InitializeTableV2", - Input: []tf.Input{ - table_handle, keys, values, - }, - } - return scope.AddOperation(opspec) -} - -// Computes the gradient of morphological 2-D dilation with respect to the input. -// -// Arguments: -// input: 4-D with shape `[batch, in_height, in_width, depth]`. -// filter: 3-D with shape `[filter_height, filter_width, depth]`. -// out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`. -// strides: 1-D of length 4. The stride of the sliding window for each dimension of -// the input tensor. Must be: `[1, stride_height, stride_width, 1]`. -// rates: 1-D of length 4. The input stride for atrous morphological dilation. -// Must be: `[1, rate_height, rate_width, 1]`. -// padding: The type of padding algorithm to use. -// -// Returns 4-D with shape `[batch, in_height, in_width, depth]`. 
-func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding} - opspec := tf.OpSpec{ - Type: "Dilation2DBackpropInput", - Input: []tf.Input{ - input, filter, out_backprop, + shape, means, stdevs, minvals, maxvals, }, Attrs: attrs, } @@ -15188,10 +19655,328 @@ func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, ou return op.Output(0) } -// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2. -type MaxPoolGradGradV2Attr func(optionalAttr) +// RetrieveTPUEmbeddingFTRLParametersAttr is an optional argument to RetrieveTPUEmbeddingFTRLParameters. +type RetrieveTPUEmbeddingFTRLParametersAttr func(optionalAttr) -// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value. +// RetrieveTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingFTRLParametersTableId(value int64) RetrieveTPUEmbeddingFTRLParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// RetrieveTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func RetrieveTPUEmbeddingFTRLParametersTableName(value string) RetrieveTPUEmbeddingFTRLParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Retrieve FTRL embedding parameters. +// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the FTRL optimization algorithm.Parameter accumulators updated by the FTRL optimization algorithm.Parameter linears updated by the FTRL optimization algorithm. +func RetrieveTPUEmbeddingFTRLParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingFTRLParameters", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Inverse fast Fourier transform. +// +// Computes the inverse 1-dimensional discrete Fourier transform over the +// inner-most dimension of `input`. +// +// Arguments: +// input: A complex tensor. +// +// Returns A complex tensor of the same shape as `input`. The inner-most +// dimension of `input` is replaced with its inverse 1D Fourier transform. +// +// @compatibility(numpy) +// Equivalent to np.fft.ifft +// @end_compatibility +func IFFT(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "IFFT", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams. 
+type CudnnRNNCanonicalToParamsAttr func(optionalAttr) + +// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value. +// If not specified, defaults to "lstm" +func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr { + return func(m optionalAttr) { + m["rnn_mode"] = value + } +} + +// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value. +// If not specified, defaults to "linear_input" +func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr { + return func(m optionalAttr) { + m["input_mode"] = value + } +} + +// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value. +// If not specified, defaults to "unidirectional" +func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr { + return func(m optionalAttr) { + m["direction"] = value + } +} + +// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value. +// If not specified, defaults to 0 +func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr { + return func(m optionalAttr) { + m["dropout"] = value + } +} + +// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value. +// If not specified, defaults to 0 +func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value. +// If not specified, defaults to 0 +func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Converts CudnnRNN params from canonical form to usable form. +// +// Writes a set of weights into the opaque params buffer so they can be used in +// upcoming training or inferences. +// +// Note that the params buffer may not be compatible across different GPUs. So any +// save and restoration should be converted to and from the canonical weights and +// biases. +// +// num_layers: Specifies the number of layers in the RNN model. +// num_units: Specifies the size of the hidden state. +// input_size: Specifies the size of the input state. +// weights: the canonical form of weights that can be used for saving +// and restoration. They are more likely to be compatible across different +// generations. +// biases: the canonical form of biases that can be used for saving +// and restoration. They are more likely to be compatible across different +// generations. +// num_params: number of parameter sets for all layers. +// Each layer may contain multiple parameter sets, with each set consisting of +// a weight matrix and a bias vector. +// rnn_mode: Indicates the type of the RNN model. +// input_mode: Indicate whether there is a linear projection between the input and +// The actual computation before the first layer. 'skip_input' is only allowed +// when input_size == num_units; 'auto_select' implies 'skip_input' when +// input_size == num_units; otherwise, it implies 'linear_input'. +// direction: Indicates whether a bidirectional model will be used. +// dir = (direction == bidirectional) ? 2 : 1 +// dropout: dropout probability. When set to 0., dropout is disabled. +// seed: the 1st part of a seed to initialize dropout. +// seed2: the 2nd part of a seed to initialize dropout. 
+func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "CudnnRNNCanonicalToParams", + Input: []tf.Input{ + num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// CropAndResizeAttr is an optional argument to CropAndResize. +type CropAndResizeAttr func(optionalAttr) + +// CropAndResizeMethod sets the optional method attribute to value. +// +// value: A string specifying the sampling method for resizing. It can be either +// `"bilinear"` or `"nearest"` and default to `"bilinear"`. Currently two sampling +// methods are supported: Bilinear and Nearest Neighbor. +// If not specified, defaults to "bilinear" +func CropAndResizeMethod(value string) CropAndResizeAttr { + return func(m optionalAttr) { + m["method"] = value + } +} + +// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value. +// +// value: Value used for extrapolation, when applicable. +// If not specified, defaults to 0 +func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr { + return func(m optionalAttr) { + m["extrapolation_value"] = value + } +} + +// Extracts crops from the input image tensor and resizes them. +// +// Extracts crops from the input image tensor and resizes them using bilinear +// sampling or nearest neighbor sampling (possibly with aspect ratio change) to a +// common output size specified by `crop_size`. This is more general than the +// `crop_to_bounding_box` op which extracts a fixed size slice from the input image +// and does not allow resizing or aspect ratio change. +// +// Returns a tensor with `crops` from the input `image` at positions defined at the +// bounding box locations in `boxes`. The cropped boxes are all resized (with +// bilinear or nearest neighbor interpolation) to a fixed +// `size = [crop_height, crop_width]`. The result is a 4-D tensor +// `[num_boxes, crop_height, crop_width, depth]`. The resizing is corner aligned. +// In particular, if `boxes = [[0, 0, 1, 1]]`, the method will give identical +// results to using `tf.image.resize_bilinear()` or +// `tf.image.resize_nearest_neighbor()`(depends on the `method` argument) with +// `align_corners=True`. +// +// Arguments: +// image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`. +// Both `image_height` and `image_width` need to be positive. +// boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor +// specifies the coordinates of a box in the `box_ind[i]` image and is specified +// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of +// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the +// `[0, 1]` interval of normalized image height is mapped to +// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in +// which case the sampled crop is an up-down flipped version of the original +// image. The width dimension is treated similarly. Normalized coordinates +// outside the `[0, 1]` range are allowed, in which case we use +// `extrapolation_value` to extrapolate the input image values. 
+// box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`. +// The value of `box_ind[i]` specifies the image that the `i`-th box refers to. +// crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All +// cropped image patches are resized to this size. The aspect ratio of the image +// content is not preserved. Both `crop_height` and `crop_width` need to be +// positive. +// +// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`. +func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "CropAndResize", + Input: []tf.Input{ + image, boxes, box_ind, crop_size, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Connects N inputs to an N-way replicated TPU computation. +func TPUReplicatedInput(scope *Scope, inputs []tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TPUReplicatedInput", + Input: []tf.Input{ + tf.OutputList(inputs), + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`. +// +// Each comparison returns a boolean `true` (if `input_value > threshold`) +// or `false` otherwise. +// +// This operation is useful for Locality-Sensitive-Hashing (LSH) and other +// algorithms that use hashing approximations of cosine and `L2` distances; +// codes can be generated from an input via: +// +// ```python +// codebook_size = 50 +// codebook_bits = codebook_size * 32 +// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits], +// dtype=x.dtype, +// initializer=tf.orthogonal_initializer()) +// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.) +// codes = tf.bitcast(codes, tf.int32) # go from uint8 to int32 +// # now codes has shape x.shape[:-1] + [codebook_size] +// ``` +// +// **NOTE**: Currently, the innermost dimension of the tensor must be divisible +// by 8. +// +// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is +// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`. +// +// Arguments: +// input: Values to compare against `threshold` and bitpack. +// threshold: Threshold to compare against. +// +// Returns The bitpacked comparisons. +func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "CompareAndBitpack", + Input: []tf.Input{ + input, threshold, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad. +type MaxPoolGradGradAttr func(optionalAttr) + +// MaxPoolGradGradDataFormat sets the optional data_format attribute to value. // // value: Specify the data format of the input and output data. With the // default format "NHWC", the data is stored in the order of: @@ -15199,7 +19984,7 @@ type MaxPoolGradGradV2Attr func(optionalAttr) // Alternatively, the format could be "NCHW", the data storage order of: // [batch, in_channels, in_height, in_width].
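A graph-construction sketch for the `CropAndResize` wrapper above; the image and box values are made up for illustration, and the session boilerplate from the `ParameterizedTruncatedNormal` sketch earlier applies unchanged.

```go
package main

import (
	"fmt"

	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// A single 4x4, one-channel image with values 1..16.
	image := op.Const(s, [][][][]float32{{
		{{1}, {2}, {3}, {4}},
		{{5}, {6}, {7}, {8}},
		{{9}, {10}, {11}, {12}},
		{{13}, {14}, {15}, {16}},
	}})
	// One box covering the whole image, in normalized [y1, x1, y2, x2] coordinates.
	boxes := op.Const(s, [][]float32{{0, 0, 1, 1}})
	boxInd := op.Const(s, []int32{0})      // the box samples from image 0 of the batch
	cropSize := op.Const(s, []int32{2, 2}) // every crop is resized to 2x2
	crops := op.CropAndResize(s, image, boxes, boxInd, cropSize, op.CropAndResizeMethod("bilinear"))
	fmt.Println(crops.Shape()) // fetch the 1x2x2x1 crops with a tf.Session as in the earlier sketch
	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
}
```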
// If not specified, defaults to "NHWC" -func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr { +func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr { return func(m optionalAttr) { m["data_format"] = value } @@ -15217,18 +20002,18 @@ func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr { // padding: The type of padding algorithm to use. // // Returns Gradients of gradients w.r.t. the input to `max_pool`. -func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) { +func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"padding": padding} + attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "MaxPoolGradGradV2", + Type: "MaxPoolGradGrad", Input: []tf.Input{ - orig_input, orig_output, grad, ksize, strides, + orig_input, orig_output, grad, }, Attrs: attrs, } @@ -15236,205 +20021,38 @@ func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output return op.Output(0) } -// AngleAttr is an optional argument to Angle. -type AngleAttr func(optionalAttr) - -// AngleTout sets the optional Tout attribute to value. -// If not specified, defaults to DT_FLOAT -func AngleTout(value tf.DataType) AngleAttr { - return func(m optionalAttr) { - m["Tout"] = value - } -} - -// Returns the argument of a complex number. +// Says whether the targets are in the top `K` predictions. // -// Given a tensor `input` of complex numbers, this operation returns a tensor of -// type `float` that is the argument of each element in `input`. All elements in -// `input` must be complex numbers of the form \\(a + bj\\), where *a* -// is the real part and *b* is the imaginary part. +// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the +// prediction for the target class is among the top `k` predictions among +// all predictions for example `i`. Note that the behavior of `InTopK` differs +// from the `TopK` op in its handling of ties; if multiple classes have the +// same prediction value and straddle the top-`k` boundary, all of those +// classes are considered to be in the top `k`. // -// The argument returned by this operation is of the form \\(atan2(b, a)\\). +// More formally, let // -// For example: +// \\(predictions_i\\) be the predictions for all classes for example `i`, +// \\(targets_i\\) be the target class for example `i`, +// \\(out_i\\) be the output for example `i`, // -// ``` -// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j] -// tf.angle(input) ==> [2.0132, 1.056] -// ``` -// -// @compatibility(numpy) -// Equivalent to np.angle. -// @end_compatibility -func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Angle", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign. 
-type ResourceStridedSliceAssignAttr func(optionalAttr) - -// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value. -// If not specified, defaults to 0 -func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr { - return func(m optionalAttr) { - m["begin_mask"] = value - } -} - -// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value. -// If not specified, defaults to 0 -func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr { - return func(m optionalAttr) { - m["end_mask"] = value - } -} - -// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value. -// If not specified, defaults to 0 -func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr { - return func(m optionalAttr) { - m["ellipsis_mask"] = value - } -} - -// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value. -// If not specified, defaults to 0 -func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr { - return func(m optionalAttr) { - m["new_axis_mask"] = value - } -} - -// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value. -// If not specified, defaults to 0 -func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr { - return func(m optionalAttr) { - m["shrink_axis_mask"] = value - } -} - -// Assign `value` to the sliced l-value reference of `ref`. -// -// The values of `value` are assigned to the positions in the variable -// `ref` that are selected by the slice parameters. The slice parameters -// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`. -// -// NOTE this op currently does not support broadcasting and so `value`'s -// shape must be exactly the shape produced by the slice of `ref`. -// -// Returns the created operation. -func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceStridedSliceAssign", - Input: []tf.Input{ - ref, begin, end, strides, value, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Returns immutable tensor from memory region. -// -// The current implementation memmaps the tensor from a file. +// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$ // // Arguments: -// dtype: Type of the returned tensor. -// shape: Shape of the returned tensor. -// memory_region_name: Name of readonly memory region used by the tensor, see -// NewReadOnlyMemoryRegionFromFile in tensorflow::Env. -func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) { +// predictions: A `batch_size` x `classes` tensor. +// targets: A `batch_size` vector of class ids. +// k: Number of top elements to look at for computing precision. +// +// Returns Computed Precision at `k` as a `bool Tensor`. 
+func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name} + attrs := map[string]interface{}{"k": k} opspec := tf.OpSpec{ - Type: "ImmutableConst", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Checks whether a tree has been initialized. -// -// Arguments: -// tree_handle: Handle to the tree. -// -// Returns Whether the tree is initialized. -func TensorForestTreeIsInitializedOp(scope *Scope, tree_handle tf.Output) (is_initialized tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorForestTreeIsInitializedOp", + Type: "InTopK", Input: []tf.Input{ - tree_handle, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ImagAttr is an optional argument to Imag. -type ImagAttr func(optionalAttr) - -// ImagTout sets the optional Tout attribute to value. -// If not specified, defaults to DT_FLOAT -func ImagTout(value tf.DataType) ImagAttr { - return func(m optionalAttr) { - m["Tout"] = value - } -} - -// Returns the imaginary part of a complex number. -// -// Given a tensor `input` of complex numbers, this operation returns a tensor of -// type `float` that is the imaginary part of each element in `input`. All -// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a* -// is the real part and *b* is the imaginary part returned by this operation. -// -// For example: -// -// ``` -// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j] -// tf.imag(input) ==> [4.75, 5.75] -// ``` -func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Imag", - Input: []tf.Input{ - input, + predictions, targets, }, Attrs: attrs, } @@ -15442,126 +20060,43 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output return op.Output(0) } -// Creates a dataset that uses a custom thread pool to compute `input_dataset`. -// -// Arguments: -// -// num_threads: Identifies the number of threads to use for the private threadpool. -// -// -func ExperimentalPrivateThreadPoolDataset(scope *Scope, input_dataset tf.Output, num_threads tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "ExperimentalPrivateThreadPoolDataset", - Input: []tf.Input{ - input_dataset, num_threads, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} +// SparseReduceSumAttr is an optional argument to SparseReduceSum. +type SparseReduceSumAttr func(optionalAttr) -// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation. -type DenseToSparseSetOperationAttr func(optionalAttr) - -// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value. -// If not specified, defaults to true -func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr { - return func(m optionalAttr) { - m["validate_indices"] = value - } -} - -// Applies set operation along last dimension of `Tensor` and `SparseTensor`. 
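A small sketch of the `InTopK` wrapper defined above, with made-up predictions; imports and session setup follow the earlier sketches.

```go
package main

import (
	"fmt"

	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	predictions := op.Const(s, [][]float32{
		{0.1, 0.8, 0.1}, // example 0: class 1 is the best prediction
		{0.6, 0.3, 0.1}, // example 1: classes 0 and 1 are the top two
	})
	targets := op.Const(s, []int32{1, 2})
	// With k = 2 the result is [true, false]: example 0's target is in the top two,
	// example 1's target (class 2) is not.
	inTop2 := op.InTopK(s, predictions, targets, 2)
	fmt.Println(inTop2.Shape()) // run the graph to fetch the boolean vector
	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
}
```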
+// SparseReduceSumKeepDims sets the optional keep_dims attribute to value. // -// See SetOperationOp::SetOperationFromContext for values of `set_operation`. -// -// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`, -// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same -// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but -// ignored. -// -// If `validate_indices` is `True`, this op validates the order and range of `set2` -// indices. -// -// Output `result` is a `SparseTensor` represented by `result_indices`, -// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this -// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth` -// dimension contains the result of `set_operation` applied to the corresponding -// `[0...n-1]` dimension of `set`. -// -// Arguments: -// set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`. -// Dimension `n` contains values in a set, duplicates are allowed but ignored. -// set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major -// order. -// set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major -// order. -// set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must -// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the -// max set size across `n-1` dimensions. -// -// -// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is -// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]` -// is the max result set size across all `0...n-1` dimensions. -func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"set_operation": set_operation} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "DenseToSparseSetOperation", - Input: []tf.Input{ - set1, set2_indices, set2_values, set2_shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// ResizeBicubicAttr is an optional argument to ResizeBicubic. -type ResizeBicubicAttr func(optionalAttr) - -// ResizeBicubicAlignCorners sets the optional align_corners attribute to value. -// -// value: If true, the centers of the 4 corner pixels of the input and output tensors are -// aligned, preserving the values at the corner pixels. Defaults to false. +// value: If true, retain reduced dimensions with length 1. // If not specified, defaults to false -func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr { +func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr { return func(m optionalAttr) { - m["align_corners"] = value + m["keep_dims"] = value } } -// ResizeBicubicHalfPixelCenters sets the optional half_pixel_centers attribute to value. -// If not specified, defaults to false -func ResizeBicubicHalfPixelCenters(value bool) ResizeBicubicAttr { - return func(m optionalAttr) { - m["half_pixel_centers"] = value - } -} - -// Resize `images` to `size` using bicubic interpolation. +// Computes the sum of elements across dimensions of a SparseTensor. 
// -// Input images can be of different types but output images are always float. +// This Op takes a SparseTensor and is the sparse counterpart to +// `tf.reduce_sum()`. In particular, this Op also returns a dense `Tensor` +// instead of a sparse one. +// +// Reduces `sp_input` along the dimensions given in `reduction_axes`. Unless +// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained +// with length 1. +// +// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor +// with a single element is returned. Additionally, the axes can be negative, +// which are interpreted according to the indexing rules in Python. // // Arguments: -// images: 4-D with shape `[batch, height, width, channels]`. -// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The -// new size for the images. +// input_indices: 2-D. `N x R` matrix with the indices of non-empty values in a +// SparseTensor, possibly not in canonical ordering. +// input_values: 1-D. `N` non-empty values corresponding to `input_indices`. +// input_shape: 1-D. Shape of the input SparseTensor. +// reduction_axes: 1-D. Length-`K` vector containing the reduction axes. // -// Returns 4-D with shape -// `[batch, new_height, new_width, channels]`. -func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) { +// Returns `R-K`-D. The reduced Tensor. +func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -15570,9 +20105,9 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R a(attrs) } opspec := tf.OpSpec{ - Type: "ResizeBicubic", + Type: "SparseReduceSum", Input: []tf.Input{ - images, size, + input_indices, input_values, input_shape, reduction_axes, }, Attrs: attrs, } @@ -15580,134 +20115,76 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R return op.Output(0) } -// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingAdagradParametersGradAccumDebug. -type RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr func(optionalAttr) - -// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 +// Updates the tree ensemble by either adding a layer to the last tree being grown // -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve Adagrad embedding parameters with debug support. +// or by starting a new tree. // -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. 
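A sketch of the `SparseReduceSum` wrapper above on a tiny hand-written SparseTensor; the dtypes chosen here (int64 indices and shape, int32 axes, float32 values) and the example values are assumptions for illustration only.

```go
package main

import (
	"fmt"

	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// Sparse form of the dense matrix [[1, 0, 0], [0, 0, 2]].
	indices := op.Const(s, [][]int64{{0, 0}, {1, 2}})
	values := op.Const(s, []float32{1, 2})
	shape := op.Const(s, []int64{2, 3})
	axes := op.Const(s, []int32{1})
	// Summing over axis 1 yields the dense vector [1, 2]; with
	// SparseReduceSumKeepDims(true) it would instead be [[1], [2]].
	rowSums := op.SparseReduceSum(s, indices, values, shape, axes)
	fmt.Println(rowSums.Shape()) // fetch the values with a tf.Session as in the earlier sketches
	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
}
```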
For example, this op is -// used to retrieve updated parameters before saving a checkpoint. +// Arguments: +// tree_ensemble_handle: Handle to the ensemble variable. +// feature_ids: Rank 1 tensor with ids for each feature. This is the real id of +// the feature that will be used in the split. +// node_ids: List of rank 1 tensors representing the nodes for which this feature +// has a split. +// gains: List of rank 1 tensors representing the gains for each of the feature's +// split. +// thresholds: List of rank 1 tensors representing the thresholds for each of the +// feature's split. +// left_node_contribs: List of rank 2 tensors with left leaf contribs for each of +// the feature's splits. Will be added to the previous node values to constitute +// the values of the left nodes. +// right_node_contribs: List of rank 2 tensors with right leaf contribs for each +// of the feature's splits. Will be added to the previous node values to constitute +// the values of the right nodes. +// max_depth: Max depth of the tree to build. +// learning_rate: shrinkage constant for each new tree. +// pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning. // -// Returns Parameter parameters updated by the Adagrad optimization algorithm.Parameter accumulators updated by the Adagrad optimization algorithm.Parameter gradient_accumulators updated by the Adagrad optimization algorithm. -func RetrieveTPUEmbeddingAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) { +// Returns the created operation. +func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } + attrs := map[string]interface{}{"pruning_mode": pruning_mode} opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug", - + Type: "BoostedTreesUpdateEnsemble", + Input: []tf.Input{ + tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate, + }, Attrs: attrs, } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) + return scope.AddOperation(opspec) } -// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve. -type MatrixTriangularSolveAttr func(optionalAttr) +// BiasAddAttr is an optional argument to BiasAdd. +type BiasAddAttr func(optionalAttr) -// MatrixTriangularSolveLower sets the optional lower attribute to value. +// BiasAddDataFormat sets the optional data_format attribute to value. // -// value: Boolean indicating whether the innermost matrices in `matrix` are -// lower or upper triangular. -// If not specified, defaults to true -func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr { +// value: Specify the data format of the input and output data. With the +// default format "NHWC", the bias tensor will be added to the last dimension +// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of: +// [batch, in_channels, in_height, in_width]. +// The tensor will be added to "in_channels", the third-to-the-last +// dimension. +// If not specified, defaults to "NHWC" +func BiasAddDataFormat(value string) BiasAddAttr { return func(m optionalAttr) { - m["lower"] = value + m["data_format"] = value } } -// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value. +// Adds `bias` to `value`. // -// value: Boolean indicating whether to solve with `matrix` or its (block-wise) -// adjoint. -// -// @compatibility(numpy) -// Equivalent to scipy.linalg.solve_triangular -// @end_compatibility -// If not specified, defaults to false -func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr { - return func(m optionalAttr) { - m["adjoint"] = value - } -} - -// Solves systems of linear equations with upper or lower triangular matrices by backsubstitution. -// -// -// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form -// square matrices. If `lower` is `True` then the strictly upper triangular part -// of each inner-most matrix is assumed to be zero and not accessed. -// If `lower` is False then the strictly lower triangular part of each inner-most -// matrix is assumed to be zero and not accessed. -// `rhs` is a tensor of shape `[..., M, K]`. -// -// The output is a tensor of shape `[..., M, K]`. If `adjoint` is -// `True` then the innermost matrices in `output` satisfy matrix equations -// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`. -// If `adjoint` is `False` then the strictly then the innermost matrices in -// `output` satisfy matrix equations -// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`. -// -// Example: -// ```python -// -// a = tf.constant([[3, 0, 0, 0], -// [2, 1, 0, 0], -// [1, 0, 1, 0], -// [1, 1, 1, 1]], dtype=tf.float32) -// -// b = tf.constant([[4], -// [2], -// [4], -// [2]], dtype=tf.float32) -// -// x = tf.linalg.triangular_solve(a, b, lower=True) -// x -// # <tf.Tensor: id=257, shape=(4, 1), dtype=float32, numpy= -// # array([[ 1.3333334 ], -// # [-0.66666675], -// # [ 2.6666665 ], -// # [-1.3333331 ]], dtype=float32)> -// -// # in python3 one can use `a@x` -// tf.matmul(a, x) -// # <tf.Tensor: id=263, shape=(4, 1), dtype=float32, numpy= -// # array([[4. ], -// # [2. ], -// # [4. ], -// # [1.9999999]], dtype=float32)> -// ``` +// This is a special case of `tf.add` where `bias` is restricted to be 1-D. +// Broadcasting is supported, so `value` may have any number of dimensions. // // Arguments: -// matrix: Shape is `[..., M, M]`. -// rhs: Shape is `[..., M, K]`. +// value: Any number of dimensions. +// bias: 1-D with size the last dimension of `value`. // -// Returns Shape is `[..., M, K]`. -func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) { +// Returns Broadcasted sum of `value` and `bias`. 
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -15716,9 +20193,9 @@ func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, option a(attrs) } opspec := tf.OpSpec{ - Type: "MatrixTriangularSolve", + Type: "BiasAdd", Input: []tf.Input{ - matrix, rhs, + value, bias, }, Attrs: attrs, } @@ -15726,323 +20203,82 @@ func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, option return op.Output(0) } -// Produces a string handle for the given MultiDeviceIterator. +// Returns true if queue is closed. +// +// This operation returns true if the queue is closed and false if the queue +// is open. // // Arguments: -// multi_device_iterator: A MultiDeviceIterator resource. -// -// Returns A string representing the resource. -func MultiDeviceIteratorToStringHandle(scope *Scope, multi_device_iterator tf.Output) (string_handle tf.Output) { +// handle: The handle to a queue. +func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "MultiDeviceIteratorToStringHandle", + Type: "QueueIsClosedV2", Input: []tf.Input{ - multi_device_iterator, + handle, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// LoadTPUEmbeddingFTRLParametersAttr is an optional argument to LoadTPUEmbeddingFTRLParameters. -type LoadTPUEmbeddingFTRLParametersAttr func(optionalAttr) +// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey. +type OrderedMapUnstageNoKeyAttr func(optionalAttr) -// LoadTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 +// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value. +// If not specified, defaults to 0 // -// REQUIRES: value >= -1 -func LoadTPUEmbeddingFTRLParametersTableId(value int64) LoadTPUEmbeddingFTRLParametersAttr { +// REQUIRES: value >= 0 +func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr { return func(m optionalAttr) { - m["table_id"] = value + m["capacity"] = value } } -// LoadTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value. +// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr { + return func(m optionalAttr) { + m["memory_limit"] = value + } +} + +// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value. // If not specified, defaults to "" -func LoadTPUEmbeddingFTRLParametersTableName(value string) LoadTPUEmbeddingFTRLParametersAttr { +func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr { return func(m optionalAttr) { - m["table_name"] = value + m["container"] = value } } -// Load FTRL embedding parameters. -// -// An op that loads optimization parameters into HBM for embedding. Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the FTRL optimization algorithm. -// accumulators: Value of accumulators used in the FTRL optimization algorithm. -// linears: Value of linears used in the FTRL optimization algorithm. 
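A sketch of the `BiasAdd` wrapper defined above, using the default "NHWC" data format; the tensor values are illustrative.

```go
package main

import (
	"fmt"

	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	value := op.Const(s, [][]float32{{1, 2, 3}, {4, 5, 6}})
	bias := op.Const(s, []float32{10, 20, 30})
	// With the default "NHWC" format the 1-D bias is broadcast along the last
	// dimension, giving [[11, 22, 33], [14, 25, 36]].
	sum := op.BiasAdd(s, value, bias)
	fmt.Println(sum.Shape()) // run the graph as in the earlier sketches to fetch the values
	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
}
```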
-// -// -// -// Returns the created operation. -func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingFTRLParameters", - Input: []tf.Input{ - parameters, accumulators, linears, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// LoadTPUEmbeddingRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingRMSPropParameters. -type LoadTPUEmbeddingRMSPropParametersAttr func(optionalAttr) - -// LoadTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func LoadTPUEmbeddingRMSPropParametersTableId(value int64) LoadTPUEmbeddingRMSPropParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// LoadTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value. +// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value. // If not specified, defaults to "" -func LoadTPUEmbeddingRMSPropParametersTableName(value string) LoadTPUEmbeddingRMSPropParametersAttr { +func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr { return func(m optionalAttr) { - m["table_name"] = value + m["shared_name"] = value } } -// Load RMSProp embedding parameters. +// Op removes and returns the (key, value) element with the smallest // -// An op that loads optimization parameters into HBM for embedding. Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the RMSProp optimization algorithm. -// ms: Value of ms used in the RMSProp optimization algorithm. -// mom: Value of mom used in the RMSProp optimization algorithm. -// -// -// -// Returns the created operation. -func LoadTPUEmbeddingRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersAttr) (o *tf.Operation) { +// key from the underlying container. If the underlying container +// does not contain elements, the op will block until it does. +func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + attrs := map[string]interface{}{"dtypes": dtypes} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingRMSPropParameters", + Type: "OrderedMapUnstageNoKey", Input: []tf.Input{ - parameters, ms, mom, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad. -type ResourceSparseApplyAdagradAttr func(optionalAttr) - -// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value. 
-// -// value: If `True`, updating of the var and accum tensors will be protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value. -// If not specified, defaults to true -func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr { - return func(m optionalAttr) { - m["update_slots"] = value - } -} - -// Update relevant entries in '*var' and '*accum' according to the adagrad scheme. -// -// That is for rows we have grad for, we update var and accum as follows: -// accum += grad * grad -// var -= lr * grad * (1 / sqrt(accum)) -// -// Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// lr: Learning rate. Must be a scalar. -// grad: The gradient. -// indices: A vector of indices into the first dimension of var and accum. -// -// Returns the created operation. -func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceSparseApplyAdagrad", - Input: []tf.Input{ - var_, accum, lr, grad, indices, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// TensorListStackAttr is an optional argument to TensorListStack. -type TensorListStackAttr func(optionalAttr) - -// TensorListStackNumElements sets the optional num_elements attribute to value. -// If not specified, defaults to -1 -func TensorListStackNumElements(value int64) TensorListStackAttr { - return func(m optionalAttr) { - m["num_elements"] = value - } -} - -// Stacks all tensors in the list. -// -// Requires that all tensors have the same shape. -// -// input_handle: the input list -// tensor: the gathered result -// num_elements: optional. If not -1, the number of elements in the list. -// -func TensorListStack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"element_dtype": element_dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TensorListStack", - Input: []tf.Input{ - input_handle, element_shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Fetches multiple values from infeed as an XLA tuple. -// -// Arguments: -// dtypes: The element types of each element in `outputs`. -// shapes: The shapes of each tensor in `outputs`. -// -// Returns A list of tensors that will be provided using the infeed mechanism. 
-func InfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape) (outputs []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes} - opspec := tf.OpSpec{ - Type: "InfeedDequeueTuple", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil { - scope.UpdateErr("InfeedDequeueTuple", err) - return - } - return outputs -} - -// LoadTPUEmbeddingMomentumParametersAttr is an optional argument to LoadTPUEmbeddingMomentumParameters. -type LoadTPUEmbeddingMomentumParametersAttr func(optionalAttr) - -// LoadTPUEmbeddingMomentumParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func LoadTPUEmbeddingMomentumParametersTableId(value int64) LoadTPUEmbeddingMomentumParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// LoadTPUEmbeddingMomentumParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func LoadTPUEmbeddingMomentumParametersTableName(value string) LoadTPUEmbeddingMomentumParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Load Momentum embedding parameters. -// -// An op that loads optimization parameters into HBM for embedding. Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the Momentum optimization algorithm. -// momenta: Value of momenta used in the Momentum optimization algorithm. -// -// -// -// Returns the created operation. -func LoadTPUEmbeddingMomentumParameters(scope *Scope, parameters tf.Output, momenta tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingMomentumParameters", - Input: []tf.Input{ - parameters, momenta, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Gets the next output from the given iterator . -func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "IteratorGetNext", - Input: []tf.Input{ - iterator, + indices, }, Attrs: attrs, } @@ -16052,109 +20288,40 @@ func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataTyp } var idx int var err error - if components, idx, err = makeOutputList(op, idx, "components"); err != nil { - scope.UpdateErr("IteratorGetNext", err) + key = op.Output(idx) + if values, idx, err = makeOutputList(op, idx, "values"); err != nil { + scope.UpdateErr("OrderedMapUnstageNoKey", err) return } - return components + return key, values } -// Computes rectified linear 6 gradients for a Relu6 operation. +// Creates a dataset that shuffles and repeats elements from `input_dataset` +// +// pseudorandomly. 
// // Arguments: -// gradients: The backpropagated gradients to the corresponding Relu6 operation. -// features: The features passed as input to the corresponding Relu6 operation, or -// its output; using either one produces the same result. // -// Returns The gradients: -// `gradients * (features > 0) * (features < 6)`. -func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) { +// buffer_size: The number of output elements to buffer in an iterator over +// this dataset. Compare with the `min_after_dequeue` attr when creating a +// `RandomShuffleQueue`. +// seed: A scalar seed for the random number generator. If either `seed` or +// `seed2` is set to be non-zero, the random number generator is seeded +// by the given seed. Otherwise, a random seed is used. +// seed2: A second scalar seed to avoid seed collision. +// count: A scalar representing the number of times the underlying dataset +// should be repeated. The default is `-1`, which results in infinite repetition. +// +// +func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { if scope.Err() != nil { return } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} opspec := tf.OpSpec{ - Type: "Relu6Grad", + Type: "ShuffleAndRepeatDataset", Input: []tf.Input{ - gradients, features, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the truth value of NOT x element-wise. -func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "LogicalNot", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// SparseToDenseAttr is an optional argument to SparseToDense. -type SparseToDenseAttr func(optionalAttr) - -// SparseToDenseValidateIndices sets the optional validate_indices attribute to value. -// -// value: If true, indices are checked to make sure they are sorted in -// lexicographic order and that there are no repeats. -// If not specified, defaults to true -func SparseToDenseValidateIndices(value bool) SparseToDenseAttr { - return func(m optionalAttr) { - m["validate_indices"] = value - } -} - -// Converts a sparse representation into a dense tensor. -// -// Builds an array `dense` with shape `output_shape` such that -// -// ``` -// # If sparse_indices is scalar -// dense[i] = (i == sparse_indices ? sparse_values : default_value) -// -// # If sparse_indices is a vector, then for each i -// dense[sparse_indices[i]] = sparse_values[i] -// -// # If sparse_indices is an n by d matrix, then for each i in [0, n) -// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i] -// ``` -// -// All other values in `dense` are set to `default_value`. If `sparse_values` is a -// scalar, all sparse indices are set to this single value. -// -// Indices should be sorted in lexicographic order, and indices must not -// contain any repeats. If `validate_indices` is true, these properties -// are checked during execution. -// -// Arguments: -// sparse_indices: 0-D, 1-D, or 2-D. `sparse_indices[i]` contains the complete -// index where `sparse_values[i]` will be placed. -// output_shape: 1-D. Shape of the dense output tensor. -// sparse_values: 1-D. Values corresponding to each row of `sparse_indices`, -// or a scalar value to be used for all sparse indices. 
-// default_value: Scalar value to set for indices not specified in -// `sparse_indices`. -// -// Returns Dense output tensor of shape `output_shape`. -func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SparseToDense", - Input: []tf.Input{ - sparse_indices, output_shape, sparse_values, default_value, + input_dataset, buffer_size, seed, seed2, count, }, Attrs: attrs, } @@ -16264,27 +20431,124 @@ func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype return op.Output(0), op.Output(1), op.Output(2) } -// Creates a dataset that changes the batch size. -// -// Creates a dataset that changes the batch size of the dataset to current batch -// size // num_workers. +// Computes the number of elements in the given queue. // // Arguments: -// input_dataset: A variant tensor representing the input dataset. -// num_workers: A scalar representing the number of workers to distribute this batch across. As -// a result of this transformation the current batch size would end up being -// divided by this parameter. +// handle: The handle to a queue. // -// -func ExperimentalRebatchDataset(scope *Scope, input_dataset tf.Output, num_workers tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { +// Returns The number of elements in the given queue. +func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} opspec := tf.OpSpec{ - Type: "ExperimentalRebatchDataset", + Type: "QueueSizeV2", Input: []tf.Input{ - input_dataset, num_workers, + handle, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingRMSPropParametersGradAccumDebug. +type LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr) + +// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Load RMSProp embedding parameters with debug support. +// +// An op that loads optimization parameters into HBM for embedding. Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. +// +// Arguments: +// parameters: Value of parameters used in the RMSProp optimization algorithm. +// ms: Value of ms used in the RMSProp optimization algorithm. +// mom: Value of mom used in the RMSProp optimization algorithm. 
+// gradient_accumulators: Value of gradient_accumulators used in the RMSProp optimization algorithm. +// +// +// +// Returns the created operation. +func LoadTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug", + Input: []tf.Input{ + parameters, ms, mom, gradient_accumulators, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// A TPU core selector Op. +// +// This Op produces a set of TPU cores (for warm-up) or a single TPU core +// (for regular inference) to execute the TPU program on. The output is +// consumed by TPUPartitionedCall. +// +// Returns A vector 1 or more TPU cores. +func TPUOrdinalSelector(scope *Scope) (device_ordinals tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TPUOrdinalSelector", + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// LeakyReluAttr is an optional argument to LeakyRelu. +type LeakyReluAttr func(optionalAttr) + +// LeakyReluAlpha sets the optional alpha attribute to value. +// If not specified, defaults to 0.2 +func LeakyReluAlpha(value float32) LeakyReluAttr { + return func(m optionalAttr) { + m["alpha"] = value + } +} + +// Computes rectified linear: `max(features, features * alpha)`. +func LeakyRelu(scope *Scope, features tf.Output, optional ...LeakyReluAttr) (activations tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LeakyRelu", + Input: []tf.Input{ + features, }, Attrs: attrs, } @@ -16292,323 +20556,610 @@ func ExperimentalRebatchDataset(scope *Scope, input_dataset tf.Output, num_worke return op.Output(0) } -// Adds `bias` to `value`. +// ResourceApplyKerasMomentumAttr is an optional argument to ResourceApplyKerasMomentum. +type ResourceApplyKerasMomentumAttr func(optionalAttr) + +// ResourceApplyKerasMomentumUseLocking sets the optional use_locking attribute to value. // -// This is a deprecated version of BiasAdd and will be soon removed. +// value: If `True`, updating of the var and accum tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceApplyKerasMomentumUseLocking(value bool) ResourceApplyKerasMomentumAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// ResourceApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value. // -// This is a special case of `tf.add` where `bias` is restricted to be 1-D. -// Broadcasting is supported, so `value` may have any number of dimensions. +// value: If `True`, the tensor passed to compute grad will be +// var + momentum * accum, so in the end, the var you get is actually +// var + momentum * accum. +// If not specified, defaults to false +func ResourceApplyKerasMomentumUseNesterov(value bool) ResourceApplyKerasMomentumAttr { + return func(m optionalAttr) { + m["use_nesterov"] = value + } +} + +// Update '*var' according to the momentum scheme. 
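A sketch of the `LeakyRelu` wrapper this hunk adds (its closing lines continue below), using the `LeakyReluAlpha` option; the input values are illustrative.

```go
package main

import (
	"fmt"

	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	features := op.Const(s, []float32{-2, -1, 0, 3})
	// max(x, alpha*x) with alpha = 0.1 gives [-0.2, -0.1, 0, 3].
	act := op.LeakyRelu(s, features, op.LeakyReluAlpha(0.1))
	fmt.Println(act.Shape()) // fetch the activations with a tf.Session as in the earlier sketches
	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
}
```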
Set use_nesterov = True if you +// +// want to use Nesterov momentum. +// +// accum = accum * momentum - lr * grad +// var += accum // // Arguments: -// value: Any number of dimensions. -// bias: 1-D with size the last dimension of `value`. +// var_: Should be from a Variable(). +// accum: Should be from a Variable(). +// lr: Scaling factor. Must be a scalar. +// grad: The gradient. +// momentum: Momentum. Must be a scalar. // -// Returns Broadcasted sum of `value` and `bias`. -func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) { +// Returns the created operation. +func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyKerasMomentumAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceApplyKerasMomentum", + Input: []tf.Input{ + var_, accum, lr, grad, momentum, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear. +type QuantizedResizeBilinearAttr func(optionalAttr) + +// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value. +// +// value: If true, the centers of the 4 corner pixels of the input and output tensors are +// aligned, preserving the values at the corner pixels. Defaults to false. +// If not specified, defaults to false +func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr { + return func(m optionalAttr) { + m["align_corners"] = value + } +} + +// QuantizedResizeBilinearHalfPixelCenters sets the optional half_pixel_centers attribute to value. +// If not specified, defaults to false +func QuantizedResizeBilinearHalfPixelCenters(value bool) QuantizedResizeBilinearAttr { + return func(m optionalAttr) { + m["half_pixel_centers"] = value + } +} + +// Resize quantized `images` to `size` using quantized bilinear interpolation. +// +// Input images and output images must be quantized types. +// +// Arguments: +// images: 4-D with shape `[batch, height, width, channels]`. +// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The +// new size for the images. +// +// +// +// Returns 4-D with shape +// `[batch, new_height, new_width, channels]`. +func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QuantizedResizeBilinear", + Input: []tf.Input{ + images, size, min, max, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Returns element-wise remainder of division. This emulates C semantics in that +// +// the result here is consistent with a truncating divide. E.g. +// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`. +// +// *NOTE*: `Mod` supports broadcasting. 
More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "BiasAddV1", + Type: "Mod", Input: []tf.Input{ - value, bias, + x, y, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// Selects elements from `x` or `y`, depending on `condition`. +// Applies sparse addition to `input` using individual values or slices // -// The `x`, and `y` tensors must all have the same shape, and the -// output will also have that shape. +// from `updates` according to indices `indices`. The updates are non-aliasing: +// `input` is only modified in-place if no other operations will use it. +// Otherwise, a copy of `input` is made. This operation has a gradient with +// respect to both `input` and `updates`. // -// The `condition` tensor must be a scalar if `x` and `y` are scalars. -// If `x` and `y` are vectors or higher rank, then `condition` must be either a -// scalar, a vector with size matching the first dimension of `x`, or must have -// the same shape as `x`. +// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. // -// The `condition` tensor acts as a mask that chooses, based on the value at each -// element, whether the corresponding element / row in the output should be -// taken from `x` (if true) or `y` (if false). +// `indices` must be integer tensor, containing indices into `input`. +// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`. // -// If `condition` is a vector and `x` and `y` are higher rank matrices, then -// it chooses which row (outer dimension) to copy from `x` and `y`. -// If `condition` has the same shape as `x` and `y`, then it chooses which -// element to copy from `x` and `y`. +// The innermost dimension of `indices` (with length `K`) corresponds to +// indices into elements (if `K = P`) or `(P-K)`-dimensional slices +// (if `K < P`) along the `K`th dimension of `input`. +// +// `updates` is `Tensor` of rank `Q-1+P-K` with shape: +// +// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$ +// +// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8 +// elements. In Python, that addition would look like this: +// +// input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8]) +// indices = tf.constant([[4], [3], [1], [7]]) +// updates = tf.constant([9, 10, 11, 12]) +// output = tf.scatter_nd_non_aliasing_add(input, indices, updates) +// with tf.Session() as sess: +// print(sess.run(output)) +// +// The resulting value `output` would look like this: +// +// [1, 13, 3, 14, 14, 6, 7, 20] +// +// See `tf.scatter_nd` for more details about how to make updates to slices. +// +// Arguments: +// input: A Tensor. +// indices: A Tensor. Must be one of the following types: `int32`, `int64`. +// A tensor of indices into `input`. +// updates: A Tensor. Must have the same type as ref. A tensor of updated values +// to add to `input`. +// +// Returns A `Tensor` with the same shape as `input`, containing values of `input` +// updated with `updates`. +func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ScatterNdNonAliasingAdd", + Input: []tf.Input{ + input, indices, updates, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceScatterNdSubAttr is an optional argument to ResourceScatterNdSub. 
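
// A minimal usage sketch for the ScatterNdNonAliasingAdd wrapper above, mirroring the
// Python example in its doc comment. It assumes the standard Go bindings packages; the
// program below is illustrative only and is not part of the generated file.
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	input := op.Const(s, []int32{1, 2, 3, 4, 5, 6, 7, 8})
	indices := op.Const(s, [][]int32{{4}, {3}, {1}, {7}}) // shape [4, 1]
	updates := op.Const(s, []int32{9, 10, 11, 12})
	out := op.ScatterNdNonAliasingAdd(s, input, indices, updates)

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()
	res, err := sess.Run(nil, []tf.Output{out}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(res[0].Value()) // expected: [1 13 3 14 14 6 7 20]
}
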
+type ResourceScatterNdSubAttr func(optionalAttr) + +// ResourceScatterNdSubUseLocking sets the optional use_locking attribute to value. +// +// value: An optional bool. Defaults to True. If True, the assignment will +// be protected by a lock; otherwise the behavior is undefined, +// but may exhibit less contention. +// If not specified, defaults to true +func ResourceScatterNdSubUseLocking(value bool) ResourceScatterNdSubAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Applies sparse subtraction to individual values or slices in a Variable. +// +// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. +// +// `indices` must be integer tensor, containing indices into `ref`. +// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`. +// +// The innermost dimension of `indices` (with length `K`) corresponds to +// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th +// dimension of `ref`. +// +// `updates` is `Tensor` of rank `Q-1+P-K` with shape: +// +// ``` +// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]] +// ``` +// +// For example, say we want to subtract 4 scattered elements from a rank-1 tensor +// with 8 elements. In Python, that subtraction would look like this: +// +// ```python +// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True) +// indices = tf.constant([[4], [3], [1], [7]]) +// updates = tf.constant([9, 10, 11, 12]) +// sub = tf.scatter_nd_sub(ref, indices, updates) +// with tf.Session() as sess: +// print sess.run(sub) +// ``` +// +// The resulting update to ref would look like this: +// +// [1, -9, 3, -6, -4, 6, 7, -4] +// +// See `tf.scatter_nd` for more details about how to make updates to +// slices. +// +// Arguments: +// ref: A resource handle. Must be from a VarHandleOp. +// indices: A Tensor. Must be one of the following types: int32, int64. +// A tensor of indices into ref. +// updates: A Tensor. Must have the same type as ref. A tensor of +// values to add to ref. +// +// Returns the created operation. +func ResourceScatterNdSub(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdSubAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceScatterNdSub", + Input: []tf.Input{ + ref, indices, updates, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// StringFormatAttr is an optional argument to StringFormat. +type StringFormatAttr func(optionalAttr) + +// StringFormatTemplate sets the optional template attribute to value. +// +// value: A string, the template to format tensor summaries into. +// If not specified, defaults to "%s" +func StringFormatTemplate(value string) StringFormatAttr { + return func(m optionalAttr) { + m["template"] = value + } +} + +// StringFormatPlaceholder sets the optional placeholder attribute to value. +// +// value: A string, at each placeholder in the template a subsequent tensor summary will be inserted. +// If not specified, defaults to "%s" +func StringFormatPlaceholder(value string) StringFormatAttr { + return func(m optionalAttr) { + m["placeholder"] = value + } +} + +// StringFormatSummarize sets the optional summarize attribute to value. +// +// value: When formatting the tensor summaries print the first and last summarize entries of each tensor dimension. 
+// If not specified, defaults to 3 +func StringFormatSummarize(value int64) StringFormatAttr { + return func(m optionalAttr) { + m["summarize"] = value + } +} + +// Formats a string template using a list of tensors. +// +// Formats a string template using a list of tensors, pretty-printing tensor summaries. +// +// Arguments: +// inputs: The list of tensors to format into the placeholder string. +// +// Returns = The resulting string scalar. +func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StringFormat", + Input: []tf.Input{ + tf.OutputList(inputs), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Real-valued fast Fourier transform. +// +// Computes the 1-dimensional discrete Fourier transform of a real-valued signal +// over the inner-most dimension of `input`. +// +// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the +// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term, +// followed by the `fft_length / 2` positive-frequency terms. +// +// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the +// corresponding dimension of `input`, the dimension is cropped. If it is larger, +// the dimension is padded with zeros. +// +// Arguments: +// input: A float32 tensor. +// fft_length: An int32 tensor of shape [1]. The FFT length. +// +// Returns A complex64 tensor of the same rank as `input`. The inner-most +// dimension of `input` is replaced with the `fft_length / 2 + 1` unique +// frequency components of its 1D Fourier transform. +// +// @compatibility(numpy) +// Equivalent to np.fft.rfft +// @end_compatibility +func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "RFFT", + Input: []tf.Input{ + input, fft_length, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes acos of x element-wise. +func Acos(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Acos", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// CudnnRNNV3Attr is an optional argument to CudnnRNNV3. +type CudnnRNNV3Attr func(optionalAttr) + +// CudnnRNNV3RnnMode sets the optional rnn_mode attribute to value. +// If not specified, defaults to "lstm" +func CudnnRNNV3RnnMode(value string) CudnnRNNV3Attr { + return func(m optionalAttr) { + m["rnn_mode"] = value + } +} + +// CudnnRNNV3InputMode sets the optional input_mode attribute to value. +// If not specified, defaults to "linear_input" +func CudnnRNNV3InputMode(value string) CudnnRNNV3Attr { + return func(m optionalAttr) { + m["input_mode"] = value + } +} + +// CudnnRNNV3Direction sets the optional direction attribute to value. +// If not specified, defaults to "unidirectional" +func CudnnRNNV3Direction(value string) CudnnRNNV3Attr { + return func(m optionalAttr) { + m["direction"] = value + } +} + +// CudnnRNNV3Dropout sets the optional dropout attribute to value. +// If not specified, defaults to 0 +func CudnnRNNV3Dropout(value float32) CudnnRNNV3Attr { + return func(m optionalAttr) { + m["dropout"] = value + } +} + +// CudnnRNNV3Seed sets the optional seed attribute to value. 
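
// A brief sketch (not part of the generated file) of the functional-option pattern the
// wrappers in this file use for optional attributes, shown here with StringFormat. The
// package and helper names below are invented for the example; evaluating the resulting
// output uses the same Session pattern as the ScatterNdNonAliasingAdd sketch earlier.
package wrappersexample

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func formatSummary(s *op.Scope, t tf.Output) tf.Output {
	// Trailing options fill the attrs map before the op is added to the graph.
	return op.StringFormat(s, []tf.Output{t},
		op.StringFormatTemplate("values: %s"),
		op.StringFormatSummarize(2))
}
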
+// If not specified, defaults to 0 +func CudnnRNNV3Seed(value int64) CudnnRNNV3Attr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// CudnnRNNV3Seed2 sets the optional seed2 attribute to value. +// If not specified, defaults to 0 +func CudnnRNNV3Seed2(value int64) CudnnRNNV3Attr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// CudnnRNNV3IsTraining sets the optional is_training attribute to value. +// If not specified, defaults to true +func CudnnRNNV3IsTraining(value bool) CudnnRNNV3Attr { + return func(m optionalAttr) { + m["is_training"] = value + } +} + +// CudnnRNNV3TimeMajor sets the optional time_major attribute to value. +// If not specified, defaults to true +func CudnnRNNV3TimeMajor(value bool) CudnnRNNV3Attr { + return func(m optionalAttr) { + m["time_major"] = value + } +} + +// A RNN backed by cuDNN. +// +// Computes the RNN from the input and initial states, with respect to the params +// buffer. Accepts one extra input "sequence_lengths" than CudnnRNN. +// +// rnn_mode: Indicates the type of the RNN model. +// input_mode: Indicates whether there is a linear projection between the input and +// the actual computation before the first layer. 'skip_input' is only allowed +// when input_size == num_units; 'auto_select' implies 'skip_input' when +// input_size == num_units; otherwise, it implies 'linear_input'. +// direction: Indicates whether a bidirectional model will be used. Should be +// "unidirectional" or "bidirectional". +// dropout: Dropout probability. When set to 0., dropout is disabled. +// seed: The 1st part of a seed to initialize dropout. +// seed2: The 2nd part of a seed to initialize dropout. +// input: If time_major is true, this is a 3-D tensor with the shape of +// [seq_length, batch_size, input_size]. If time_major is false, the shape is +// [batch_size, seq_length, input_size]. +// input_h: If time_major is true, this is a 3-D tensor with the shape of +// [num_layer * dir, batch_size, num_units]. If time_major is false, the shape +// is [batch_size, num_layer * dir, num_units]. +// input_c: For LSTM, a 3-D tensor with the shape of +// [num_layer * dir, batch, num_units]. For other models, it is ignored. +// params: A 1-D tensor that contains the weights and biases in an opaque layout. +// The size must be created through CudnnRNNParamsSize, and initialized +// separately. Note that they might not be compatible across different +// generations. So it is a good idea to save and restore +// sequence_lengths: a vector of lengths of each input sequence. +// output: If time_major is true, this is a 3-D tensor with the shape of +// [seq_length, batch_size, dir * num_units]. If time_major is false, the +// shape is [batch_size, seq_length, dir * num_units]. +// output_h: The same shape has input_h. +// output_c: The same shape as input_c for LSTM. An empty tensor for other models. +// is_training: Indicates whether this operation is used for inferenece or +// training. +// time_major: Indicates whether the input/output format is time major or batch +// major. +// reserve_space: An opaque tensor that can be used in backprop calculation. It +// is only produced if is_training is true. 
+func CudnnRNNV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, optional ...CudnnRNNV3Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "CudnnRNNV3", + Input: []tf.Input{ + input, input_h, input_c, params, sequence_lengths, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4) +} + +// Deprecated. Use TensorArraySizeV3 +// +// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3 +func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorArraySizeV2", + Input: []tf.Input{ + handle, flow_in, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Bucketizes 'input' based on 'boundaries'. +// +// For example, if the inputs are +// boundaries = [0, 10, 100] +// input = [[-5, 10000] +// [150, 10] +// [5, 100]] +// +// then the output will be +// output = [[0, 3] +// [3, 2] +// [1, 3]] +// +// Arguments: +// input: Any shape of Tensor contains with int or float type. +// boundaries: A sorted list of floats gives the boundary of the buckets. +// +// Returns Same shape with 'input', each value of input replaced with bucket index. +// +// @compatibility(numpy) +// Equivalent to np.digitize. +// @end_compatibility +func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"boundaries": boundaries} + opspec := tf.OpSpec{ + Type: "Bucketize", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Pads a tensor. +// +// This operation pads `input` according to the `paddings` and `constant_values` +// you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is +// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates +// how many padding values to add before the contents of `input` in that dimension, +// and `paddings[D, 1]` indicates how many padding values to add after the contents +// of `input` in that dimension. `constant_values` is a scalar tensor of the same +// type as `input` that indicates the value to use for padding `input`. +// +// The padded size of each dimension D of the output is: +// +// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` // // For example: // -// ```python -// # 'condition' tensor is [[True, False] -// # [False, True]] -// # 't' is [[1, 2], -// # [3, 4]] -// # 'e' is [[5, 6], -// # [7, 8]] -// select(condition, t, e) # => [[1, 6], [7, 4]] -// -// -// # 'condition' tensor is [True, False] -// # 't' is [[1, 2], -// # [3, 4]] -// # 'e' is [[5, 6], -// # [7, 8]] -// select(condition, t, e) ==> [[1, 2], -// [7, 8]] -// // ``` -// -// Arguments: -// -// x: = A `Tensor` which may have the same shape as `condition`. -// If `condition` is rank 1, `x` may have higher rank, -// but its first dimension must match the size of `condition`. -// y: = A `Tensor` with the same type and shape as `x`. -// -// Returns = A `Tensor` with the same type and shape as `x` and `y`. 
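
// A brief sketch (not part of the generated file) reproducing the Bucketize example from
// its doc comment above; the package and helper names are invented. Each input value is
// replaced by the index of the bucket it falls into.
package wrappersexample

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func bucketizeExample(s *op.Scope) tf.Output {
	input := op.Const(s, [][]int32{{-5, 10000}, {150, 10}, {5, 100}})
	// Boundaries are a static attribute, not a tensor input.
	// Evaluates to [[0 3] [3 2] [1 3]] when the graph is run.
	return op.Bucketize(s, input, []float32{0, 10, 100})
}
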
-func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) { +// # 't' is [[1, 1], [2, 2]] +// # 'paddings' is [[1, 1], [2, 2]] +// # 'constant_values' is 0 +// # rank of 't' is 2 +// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] +// [0, 0, 1, 1, 0, 0] +// [0, 0, 2, 2, 0, 0] +// [0, 0, 0, 0, 0, 0]] +// ``` +func PadV2(scope *Scope, input tf.Output, paddings tf.Output, constant_values tf.Output) (output tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "Select", + Type: "PadV2", Input: []tf.Input{ - condition, x, y, + input, paddings, constant_values, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug. -type LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr) - -// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Load proximal Adagrad embedding parameters with debug support. -// -// An op that loads optimization parameters into HBM for embedding. Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the proximal Adagrad optimization algorithm. -// accumulators: Value of accumulators used in the proximal Adagrad optimization algorithm. -// gradient_accumulators: Value of gradient_accumulators used in the proximal Adagrad optimization algorithm. -// -// -// -// Returns the created operation. -func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug", - Input: []tf.Input{ - parameters, accumulators, gradient_accumulators, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// NonDeterministicIntsAttr is an optional argument to NonDeterministicInts. -type NonDeterministicIntsAttr func(optionalAttr) - -// NonDeterministicIntsDtype sets the optional dtype attribute to value. -// -// value: The type of the output. 
-// If not specified, defaults to DT_INT64 -func NonDeterministicIntsDtype(value tf.DataType) NonDeterministicIntsAttr { - return func(m optionalAttr) { - m["dtype"] = value - } -} - -// Non-deterministically generates some integers. -// -// This op may use some OS-provided source of non-determinism (e.g. an RNG), so each execution will give different results. -// -// Arguments: -// shape: The shape of the output tensor. -// -// Returns Non-deterministic integer values with specified shape. -func NonDeterministicInts(scope *Scope, shape tf.Output, optional ...NonDeterministicIntsAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "NonDeterministicInts", - Input: []tf.Input{ - shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap. -type AddManySparseToTensorsMapAttr func(optionalAttr) - -// AddManySparseToTensorsMapContainer sets the optional container attribute to value. -// -// value: The container name for the `SparseTensorsMap` created by this op. -// If not specified, defaults to "" -func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value. -// -// value: The shared name for the `SparseTensorsMap` created by this op. -// If blank, the new Operation's unique name is used. -// If not specified, defaults to "" -func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles. -// -// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`, -// `sparse_values`, and `sparse_shape`, where -// -// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R``` -// -// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor` -// having a first `sparse_indices` column taking values between `[0, N)`, where -// the minibatch size `N == sparse_shape[0]`. -// -// The input `SparseTensor` must have rank `R` greater than 1, and the first -// dimension is treated as the minibatch dimension. Elements of the `SparseTensor` -// must be sorted in increasing order of this first dimension. The stored -// `SparseTensor` objects pointed to by each row of the output `sparse_handles` -// will have rank `R-1`. -// -// The `SparseTensor` values can then be read out as part of a minibatch by passing -// the given keys as vector elements to `TakeManySparseFromTensorsMap`. To ensure -// the correct `SparseTensorsMap` is accessed, ensure that the same -// `container` and `shared_name` are passed to that Op. If no `shared_name` -// is provided here, instead use the *name* of the Operation created by calling -// `AddManySparseToTensorsMap` as the `shared_name` passed to -// `TakeManySparseFromTensorsMap`. Ensure the Operations are colocated. -// -// Arguments: -// sparse_indices: 2-D. The `indices` of the minibatch `SparseTensor`. -// `sparse_indices[:, 0]` must be ordered values in `[0, N)`. -// sparse_values: 1-D. The `values` of the minibatch `SparseTensor`. -// sparse_shape: 1-D. The `shape` of the minibatch `SparseTensor`. -// The minibatch size `N == sparse_shape[0]`. 
-// -// Returns 1-D. The handles of the `SparseTensor` now stored in the -// `SparseTensorsMap`. Shape: `[N]`. -func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "AddManySparseToTensorsMap", - Input: []tf.Input{ - sparse_indices, sparse_values, sparse_shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// LoadTPUEmbeddingADAMParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingADAMParametersGradAccumDebug. -type LoadTPUEmbeddingADAMParametersGradAccumDebugAttr func(optionalAttr) - -// LoadTPUEmbeddingADAMParametersGradAccumDebugTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func LoadTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// LoadTPUEmbeddingADAMParametersGradAccumDebugTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func LoadTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Load ADAM embedding parameters with debug support. -// -// An op that loads optimization parameters into HBM for embedding. Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the ADAM optimization algorithm. -// momenta: Value of momenta used in the ADAM optimization algorithm. -// velocities: Value of velocities used in the ADAM optimization algorithm. -// gradient_accumulators: Value of gradient_accumulators used in the ADAM optimization algorithm. -// -// -// -// Returns the created operation. -func LoadTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersGradAccumDebugAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingADAMParametersGradAccumDebug", - Input: []tf.Input{ - parameters, momenta, velocities, gradient_accumulators, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - // LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingAdagradParametersGradAccumDebug. type LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr func(optionalAttr) @@ -16664,31 +21215,41 @@ func LoadTPUEmbeddingAdagradParametersGradAccumDebug(scope *Scope, parameters tf return scope.AddOperation(opspec) } -// StatefulStandardNormalAttr is an optional argument to StatefulStandardNormal. -type StatefulStandardNormalAttr func(optionalAttr) +// Computes tan of x element-wise. 
+func Tan(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Tan", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} -// StatefulStandardNormalDtype sets the optional dtype attribute to value. -// -// value: The type of the output. -// If not specified, defaults to DT_FLOAT -func StatefulStandardNormalDtype(value tf.DataType) StatefulStandardNormalAttr { +// QuantizedReluAttr is an optional argument to QuantizedRelu. +type QuantizedReluAttr func(optionalAttr) + +// QuantizedReluOutType sets the optional out_type attribute to value. +// If not specified, defaults to DT_QUINT8 +func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr { return func(m optionalAttr) { - m["dtype"] = value + m["out_type"] = value } } -// Outputs random values from a normal distribution. This op is deprecated in favor of op 'StatefulStandardNormalV2' -// -// DEPRECATED at GraphDef version 29: Use StatefulStandardNormalV2 instead -// -// The generated values will have mean 0 and standard deviation 1. +// Computes Quantized Rectified Linear: `max(features, 0)` // // Arguments: -// resource: The handle of the resource variable that stores the state of the RNG. -// shape: The shape of the output tensor. // -// Returns A tensor of the specified shape filled with random normal values. -func StatefulStandardNormal(scope *Scope, resource tf.Output, shape tf.Output, optional ...StatefulStandardNormalAttr) (output tf.Output) { +// min_features: The float value that the lowest quantized value represents. +// max_features: The float value that the highest quantized value represents. +// +// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents. +func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) { if scope.Err() != nil { return } @@ -16697,97 +21258,9 @@ func StatefulStandardNormal(scope *Scope, resource tf.Output, shape tf.Output, o a(attrs) } opspec := tf.OpSpec{ - Type: "StatefulStandardNormal", + Type: "QuantizedRelu", Input: []tf.Input{ - resource, shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Shuts down a running distributed TPU system. -// -// The op returns an error if no system is running. -// -// Returns the created operation. -func ShutdownDistributedTPU(scope *Scope) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ShutdownDistributedTPU", - } - return scope.AddOperation(opspec) -} - -// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler. -type LogUniformCandidateSamplerAttr func(optionalAttr) - -// LogUniformCandidateSamplerSeed sets the optional seed attribute to value. -// -// value: If either seed or seed2 are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value. -// -// value: An second seed to avoid seed collision. 
-// If not specified, defaults to 0 -func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Generates labels for candidate sampling with a log-uniform distribution. -// -// See explanations of candidate sampling and the data formats at -// go/candidate-sampling. -// -// For each batch, this op picks a single set of sampled candidate labels. -// -// The advantages of sampling candidates per-batch are simplicity and the -// possibility of efficient dense matrix multiplication. The disadvantage is that -// the sampled candidates must be chosen independently of the context and of the -// true labels. -// -// Arguments: -// true_classes: A batch_size * num_true matrix, in which each row contains the -// IDs of the num_true target_classes in the corresponding original label. -// num_true: Number of true labels per context. -// num_sampled: Number of candidates to randomly sample. -// unique: If unique is true, we sample with rejection, so that all sampled -// candidates in a batch are unique. This requires some approximation to -// estimate the post-rejection sampling probabilities. -// range_max: The sampler will sample integers from the interval [0, range_max). -// -// Returns A vector of length num_sampled, in which each element is -// the ID of a sampled candidate.A batch_size * num_true matrix, representing -// the number of times each candidate is expected to occur in a batch -// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled -// candidate representing the number of times the candidate is expected -// to occur in a batch of sampled candidates. If unique=true, then this is a -// probability. -func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LogUniformCandidateSampler", - Input: []tf.Input{ - true_classes, + features, min_features, max_features, }, Attrs: attrs, } @@ -16795,502 +21268,6 @@ func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true i return op.Output(0), op.Output(1), op.Output(2) } -// LoadTPUEmbeddingAdadeltaParametersAttr is an optional argument to LoadTPUEmbeddingAdadeltaParameters. -type LoadTPUEmbeddingAdadeltaParametersAttr func(optionalAttr) - -// LoadTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func LoadTPUEmbeddingAdadeltaParametersTableId(value int64) LoadTPUEmbeddingAdadeltaParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// LoadTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func LoadTPUEmbeddingAdadeltaParametersTableName(value string) LoadTPUEmbeddingAdadeltaParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Load Adadelta embedding parameters. -// -// An op that loads optimization parameters into HBM for embedding. 
Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the Adadelta optimization algorithm. -// accumulators: Value of accumulators used in the Adadelta optimization algorithm. -// updates: Value of updates used in the Adadelta optimization algorithm. -// -// -// -// Returns the created operation. -func LoadTPUEmbeddingAdadeltaParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingAdadeltaParameters", - Input: []tf.Input{ - parameters, accumulators, updates, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Reduces sparse updates into the variable referenced by `resource` using the `min` operation. -// -// This operation computes -// -// # Scalar indices -// ref[indices, ...] = min(ref[indices, ...], updates[...]) -// -// # Vector indices (for each i) -// ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...]) -// -// # High rank indices (for each i, ..., j) -// ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...]) -// -// Duplicate entries are handled correctly: if multiple `indices` reference -// the same location, their contributions are combined. -// -// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt> -// </div> -// -// Arguments: -// resource: Should be from a `Variable` node. -// indices: A tensor of indices into the first dimension of `ref`. -// updates: A tensor of updated values to add to `ref`. -// -// Returns the created operation. -func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ResourceScatterMin", - Input: []tf.Input{ - resource, indices, updates, - }, - } - return scope.AddOperation(opspec) -} - -// Worker heartbeat op. -// -// Heartbeats may be sent periodically to indicate the coordinator is still active, -// to retrieve the current worker status and to expedite shutdown when necessary. -// -// Arguments: -// request: A string tensor containing a serialized WorkerHeartbeatRequest -// -// Returns A string tensor containing a serialized WorkerHeartbeatResponse -func WorkerHeartbeat(scope *Scope, request tf.Output) (response tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "WorkerHeartbeat", - Input: []tf.Input{ - request, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// DecodeRawAttr is an optional argument to DecodeRaw. -type DecodeRawAttr func(optionalAttr) - -// DecodeRawLittleEndian sets the optional little_endian attribute to value. -// -// value: Whether the input `bytes` are in little-endian order. -// Ignored for `out_type` values that are stored in a single byte like -// `uint8`. 
-// If not specified, defaults to true -func DecodeRawLittleEndian(value bool) DecodeRawAttr { - return func(m optionalAttr) { - m["little_endian"] = value - } -} - -// Reinterpret the bytes of a string as a vector of numbers. -// -// Arguments: -// bytes: All the elements must have the same length. -// -// -// Returns A Tensor with one more dimension than the input `bytes`. The -// added dimension will have size equal to the length of the elements -// of `bytes` divided by the number of bytes to represent `out_type`. -func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"out_type": out_type} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "DecodeRaw", - Input: []tf.Input{ - bytes, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Gives a guarantee to the TF runtime that the input tensor is a constant. -// -// The runtime is then free to make optimizations based on this. -// -// Only accepts value typed tensors as inputs and rejects resource variable handles -// as input. -// -// Returns the input tensor without modification. -func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "GuaranteeConst", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// TensorArrayV2Attr is an optional argument to TensorArrayV2. -type TensorArrayV2Attr func(optionalAttr) - -// TensorArrayV2ElementShape sets the optional element_shape attribute to value. -// If not specified, defaults to <unknown_rank:true > -func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr { - return func(m optionalAttr) { - m["element_shape"] = value - } -} - -// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value. -// If not specified, defaults to false -func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr { - return func(m optionalAttr) { - m["dynamic_size"] = value - } -} - -// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value. -// If not specified, defaults to true -func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr { - return func(m optionalAttr) { - m["clear_after_read"] = value - } -} - -// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value. -// If not specified, defaults to "" -func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr { - return func(m optionalAttr) { - m["tensor_array_name"] = value - } -} - -// Deprecated. Use TensorArrayV3 -// -// DEPRECATED at GraphDef version 26: Use TensorArrayV3 -func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TensorArrayV2", - Input: []tf.Input{ - size, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Converts each string in the input Tensor to its hash mod by a number of buckets. -// -// The hash function is deterministic on the content of the string within the -// process and will never change. However, it is not suitable for cryptography. -// This function may be used when CPU time is scarce and inputs are trusted or -// unimportant. 
There is a risk of adversaries constructing inputs that all hash -// to the same bucket. To prevent this problem, use a strong hash function with -// `tf.string_to_hash_bucket_strong`. -// -// Arguments: -// input: The strings to assign a hash bucket. -// num_buckets: The number of buckets. -// -// Returns A Tensor of the same shape as the input `string_tensor`. -func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_buckets": num_buckets} - opspec := tf.OpSpec{ - Type: "StringToHashBucketFast", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// AsStringAttr is an optional argument to AsString. -type AsStringAttr func(optionalAttr) - -// AsStringPrecision sets the optional precision attribute to value. -// -// value: The post-decimal precision to use for floating point numbers. -// Only used if precision > -1. -// If not specified, defaults to -1 -func AsStringPrecision(value int64) AsStringAttr { - return func(m optionalAttr) { - m["precision"] = value - } -} - -// AsStringScientific sets the optional scientific attribute to value. -// -// value: Use scientific notation for floating point numbers. -// If not specified, defaults to false -func AsStringScientific(value bool) AsStringAttr { - return func(m optionalAttr) { - m["scientific"] = value - } -} - -// AsStringShortest sets the optional shortest attribute to value. -// -// value: Use shortest representation (either scientific or standard) for -// floating point numbers. -// If not specified, defaults to false -func AsStringShortest(value bool) AsStringAttr { - return func(m optionalAttr) { - m["shortest"] = value - } -} - -// AsStringWidth sets the optional width attribute to value. -// -// value: Pad pre-decimal numbers to this width. -// Applies to both floating point and integer numbers. -// Only used if width > -1. -// If not specified, defaults to -1 -func AsStringWidth(value int64) AsStringAttr { - return func(m optionalAttr) { - m["width"] = value - } -} - -// AsStringFill sets the optional fill attribute to value. -// -// value: The value to pad if width > -1. If empty, pads with spaces. -// Another typical value is '0'. String cannot be longer than 1 character. -// If not specified, defaults to "" -func AsStringFill(value string) AsStringAttr { - return func(m optionalAttr) { - m["fill"] = value - } -} - -// Converts each entry in the given tensor to strings. Supports many numeric -// -// types and boolean. -func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "AsString", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Constructs an Optional variant from a tuple of tensors. -func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "OptionalFromValue", - Input: []tf.Input{ - tf.OutputList(components), - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// A TPU core selector Op. -// -// This Op produces a set of TPU cores (for warm-up) or a single TPU core -// (for regular inference) to execute the TPU program on. The output is -// consumed by TPUPartitionedCall. 
-// -// Returns A vector 1 or more TPU cores. -func TPUOrdinalSelector(scope *Scope) (device_ordinals tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TPUOrdinalSelector", - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// TopKAttr is an optional argument to TopK. -type TopKAttr func(optionalAttr) - -// TopKSorted sets the optional sorted attribute to value. -// -// value: If true the resulting `k` elements will be sorted by the values in -// descending order. -// If not specified, defaults to true -func TopKSorted(value bool) TopKAttr { - return func(m optionalAttr) { - m["sorted"] = value - } -} - -// Finds values and indices of the `k` largest elements for the last dimension. -// -// DEPRECATED at GraphDef version 7: Use TopKV2 instead -// -// If the input is a vector (rank-1), finds the `k` largest entries in the vector -// and outputs their values and indices as vectors. Thus `values[j]` is the -// `j`-th largest entry in `input`, and its index is `indices[j]`. -// -// For matrices (resp. higher rank input), computes the top `k` entries in each -// row (resp. vector along the last dimension). Thus, -// -// values.shape = indices.shape = input.shape[:-1] + [k] -// -// If two elements are equal, the lower-index element appears first. -// -// If `k` varies dynamically, use `TopKV2` below. -// -// Arguments: -// input: 1-D or higher with last dimension at least `k`. -// k: Number of top elements to look for along the last dimension (along each -// row for matrices). -// -// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`. -func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"k": k} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TopK", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// LoadTPUEmbeddingMDLAdagradLightParametersAttr is an optional argument to LoadTPUEmbeddingMDLAdagradLightParameters. -type LoadTPUEmbeddingMDLAdagradLightParametersAttr func(optionalAttr) - -// LoadTPUEmbeddingMDLAdagradLightParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func LoadTPUEmbeddingMDLAdagradLightParametersTableId(value int64) LoadTPUEmbeddingMDLAdagradLightParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// LoadTPUEmbeddingMDLAdagradLightParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func LoadTPUEmbeddingMDLAdagradLightParametersTableName(value string) LoadTPUEmbeddingMDLAdagradLightParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Load MDL Adagrad Light embedding parameters. -// -// An op that loads optimization parameters into HBM for embedding. Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the MDL Adagrad Light optimization algorithm. -// accumulators: Value of accumulators used in the MDL Adagrad Light optimization algorithm. 
-// weights: Value of weights used in the MDL Adagrad Light optimization algorithm. -// benefits: Value of benefits used in the MDL Adagrad Light optimization algorithm. -// -// -// -// Returns the created operation. -func LoadTPUEmbeddingMDLAdagradLightParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, weights tf.Output, benefits tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMDLAdagradLightParametersAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingMDLAdagradLightParameters", - Input: []tf.Input{ - parameters, accumulators, weights, benefits, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - // UnicodeTranscodeAttr is an optional argument to UnicodeTranscode. type UnicodeTranscodeAttr func(optionalAttr) @@ -17393,38 +21370,29 @@ func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, outp return op.Output(0) } -// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign. -type ResourceApplyAddSignAttr func(optionalAttr) +// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent. +type ResourceApplyGradientDescentAttr func(optionalAttr) -// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value. +// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value. // -// value: If `True`, updating of the var and m tensors is -// protected by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. +// value: If `True`, the subtraction will be protected by a lock; +// otherwise the behavior is undefined, but may exhibit less contention. // If not specified, defaults to false -func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr { +func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr { return func(m optionalAttr) { m["use_locking"] = value } } -// Update '*var' according to the AddSign update. -// -// m_t <- beta1 * m_{t-1} + (1 - beta1) * g -// update <- (alpha + sign_decay * sign(g) *sign(m)) * g -// variable <- variable - lr_t * update +// Update '*var' by subtracting 'alpha' * 'delta' from it. // // Arguments: // var_: Should be from a Variable(). -// m: Should be from a Variable(). -// lr: Scaling factor. Must be a scalar. -// alpha: Must be a scalar. -// sign_decay: Must be a scalar. -// beta: Must be a scalar. -// grad: The gradient. +// alpha: Scaling factor. Must be a scalar. +// delta: The change. // // Returns the created operation. 
-func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) { +func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) { if scope.Err() != nil { return } @@ -17433,37 +21401,232 @@ func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Outpu a(attrs) } opspec := tf.OpSpec{ - Type: "ResourceApplyAddSign", + Type: "ResourceApplyGradientDescent", Input: []tf.Input{ - var_, m, lr, alpha, sign_decay, beta, grad, + var_, alpha, delta, }, Attrs: attrs, } return scope.AddOperation(opspec) } -// LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingMomentumParametersGradAccumDebug. -type LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr) +// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2. +type MaxPoolGradV2Attr func(optionalAttr) -// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value. +// MaxPoolGradV2DataFormat sets the optional data_format attribute to value. +// +// value: Specify the data format of the input and output data. With the +// default format "NHWC", the data is stored in the order of: +// [batch, in_height, in_width, in_channels]. +// Alternatively, the format could be "NCHW", the data storage order of: +// [batch, in_channels, in_height, in_width]. +// If not specified, defaults to "NHWC" +func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Computes gradients of the maxpooling function. +// +// Arguments: +// orig_input: The original input tensor. +// orig_output: The original output tensor. +// grad: 4-D. Gradients w.r.t. the output of `max_pool`. +// ksize: The size of the window for each dimension of the input tensor. +// strides: The stride of the sliding window for each dimension of the +// input tensor. +// padding: The type of padding algorithm to use. +// +// Returns Gradients w.r.t. the input to `max_pool`. +func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MaxPoolGradV2", + Input: []tf.Input{ + orig_input, orig_output, grad, ksize, strides, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes softsign: `features / (abs(features) + 1)`. +func Softsign(scope *Scope, features tf.Output) (activations tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Softsign", + Input: []tf.Input{ + features, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes sin of x element-wise. +func Sin(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Sin", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize. 
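
// A brief sketch (not part of the generated file) showing how a state-mutating wrapper
// such as ResourceApplyGradientDescent is used: it returns a *tf.Operation, which only
// takes effect when passed to Session.Run as a target. Package and helper names are
// invented; VarHandleOp and AssignVariableOp are other wrappers from this same package.
package wrappersexample

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func sgdStep(s *op.Scope) (init, step *tf.Operation) {
	// A float32 resource variable of shape [2], initialized to [1, 2].
	v := op.VarHandleOp(s, tf.Float, tf.MakeShape(2))
	init = op.AssignVariableOp(s, v, op.Const(s, []float32{1, 2}))
	// Applies var -= alpha * delta; run init once, then pass step as a Run target.
	alpha := op.Const(s, float32(0.1))
	delta := op.Const(s, []float32{0.5, 0.5})
	step = op.ResourceApplyGradientDescent(s, v, alpha, delta)
	return init, step
}
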
+type QuantizeAndDequantizeAttr func(optionalAttr) + +// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value. +// If not specified, defaults to true +func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr { + return func(m optionalAttr) { + m["signed_input"] = value + } +} + +// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value. +// If not specified, defaults to 8 +func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr { + return func(m optionalAttr) { + m["num_bits"] = value + } +} + +// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value. +// If not specified, defaults to false +func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr { + return func(m optionalAttr) { + m["range_given"] = value + } +} + +// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value. +// If not specified, defaults to 0 +func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr { + return func(m optionalAttr) { + m["input_min"] = value + } +} + +// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value. +// If not specified, defaults to 0 +func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr { + return func(m optionalAttr) { + m["input_max"] = value + } +} + +// Use QuantizeAndDequantizeV2 instead. +// +// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2 +func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QuantizeAndDequantize", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// TopKV2Attr is an optional argument to TopKV2. +type TopKV2Attr func(optionalAttr) + +// TopKV2Sorted sets the optional sorted attribute to value. +// +// value: If true the resulting `k` elements will be sorted by the values in +// descending order. +// If not specified, defaults to true +func TopKV2Sorted(value bool) TopKV2Attr { + return func(m optionalAttr) { + m["sorted"] = value + } +} + +// Finds values and indices of the `k` largest elements for the last dimension. +// +// If the input is a vector (rank-1), finds the `k` largest entries in the vector +// and outputs their values and indices as vectors. Thus `values[j]` is the +// `j`-th largest entry in `input`, and its index is `indices[j]`. +// +// For matrices (resp. higher rank input), computes the top `k` entries in each +// row (resp. vector along the last dimension). Thus, +// +// values.shape = indices.shape = input.shape[:-1] + [k] +// +// If two elements are equal, the lower-index element appears first. +// +// Arguments: +// input: 1-D or higher with last dimension at least `k`. +// k: 0-D. Number of top elements to look for along the last dimension (along each +// row for matrices). +// +// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`. 
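The wrappers in this file are consumed by building a graph through an `op.Scope` and then executing it with a `tf.Session`. A minimal runnable sketch for the `TopKV2` wrapper that follows, assuming the standard `tensorflow/go` and `tensorflow/go/op` import paths:

```go
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	input := op.Const(s, [][]float32{{1, 4, 2, 8}, {3, 7, 5, 6}})
	k := op.Const(s, int32(2))
	// Top-2 values and their indices along the last dimension of each row.
	values, indices := op.TopKV2(s, input, k, op.TopKV2Sorted(true))

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	out, err := sess.Run(nil, []tf.Output{values, indices}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value()) // [[8 4] [7 6]]
	fmt.Println(out[1].Value()) // [[3 1] [1 3]]
}
```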
+func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TopKV2", + Input: []tf.Input{ + input, k, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// LoadTPUEmbeddingADAMParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingADAMParametersGradAccumDebug. +type LoadTPUEmbeddingADAMParametersGradAccumDebugAttr func(optionalAttr) + +// LoadTPUEmbeddingADAMParametersGradAccumDebugTableId sets the optional table_id attribute to value. // If not specified, defaults to -1 // // REQUIRES: value >= -1 -func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr { +func LoadTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr { return func(m optionalAttr) { m["table_id"] = value } } -// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value. +// LoadTPUEmbeddingADAMParametersGradAccumDebugTableName sets the optional table_name attribute to value. // If not specified, defaults to "" -func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr { +func LoadTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr { return func(m optionalAttr) { m["table_name"] = value } } -// Load Momentum embedding parameters with debug support. +// Load ADAM embedding parameters with debug support. // // An op that loads optimization parameters into HBM for embedding. Must be // preceded by a ConfigureTPUEmbeddingHost op that sets up the correct @@ -17472,14 +21635,15 @@ func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) Loa // executed. // // Arguments: -// parameters: Value of parameters used in the Momentum optimization algorithm. -// momenta: Value of momenta used in the Momentum optimization algorithm. -// gradient_accumulators: Value of gradient_accumulators used in the Momentum optimization algorithm. +// parameters: Value of parameters used in the ADAM optimization algorithm. +// momenta: Value of momenta used in the ADAM optimization algorithm. +// velocities: Value of velocities used in the ADAM optimization algorithm. +// gradient_accumulators: Value of gradient_accumulators used in the ADAM optimization algorithm. // // // // Returns the created operation. 
-func LoadTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr) (o *tf.Operation) { +func LoadTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersGradAccumDebugAttr) (o *tf.Operation) { if scope.Err() != nil { return } @@ -17488,37 +21652,1523 @@ func LoadTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, parameters t a(attrs) } opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingMomentumParametersGradAccumDebug", + Type: "LoadTPUEmbeddingADAMParametersGradAccumDebug", Input: []tf.Input{ - parameters, momenta, gradient_accumulators, + parameters, momenta, velocities, gradient_accumulators, }, Attrs: attrs, } return scope.AddOperation(opspec) } -// LoadTPUEmbeddingProximalAdagradParametersAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParameters. -type LoadTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr) +// Computes the Bessel i0e function of `x` element-wise. +// +// Exponentially scaled modified Bessel function of order 0 defined as +// `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`. +// +// This function is faster and numerically stabler than `bessel_i0(x)`. +func BesselI0e(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BesselI0e", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} -// LoadTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value. +// Check if the input matches the regex pattern. +// +// The input is a string tensor of any shape. The pattern is a scalar +// string tensor which is applied to every element of the input tensor. +// The boolean values (True or False) of the output tensor indicate +// if the input matches the regex pattern provided. +// +// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax) +// +// Arguments: +// input: A string tensor of the text to be processed. +// pattern: A scalar string tensor containing the regular expression to match the input. +// +// Returns A bool tensor with the same shape as `input`. +func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "RegexFullMatch", + Input: []tf.Input{ + input, pattern, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the sum along segments of a tensor. +// +// Read +// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) +// for an explanation of segments. +// +// Computes a tensor such that +// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such +// that `segment_ids[j...] == i`. Unlike `SegmentSum`, `segment_ids` +// need not be sorted and need not cover all values in the full +// range of valid values. +// +// If the sum is empty for a given segment ID `i`, `output[i] = 0`. +// If the given segment ID `i` is negative, the value is dropped and will not be +// added to the sum of the segment. +// +// `num_segments` should equal the number of distinct segment IDs. 
+// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt> +// </div> +// +// ``` python +// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]]) +// tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2) +// # ==> [[ 5, 5, 5, 5], +// # [5, 6, 7, 8]] +// ``` +// +// +// Arguments: +// +// segment_ids: A tensor whose shape is a prefix of `data.shape`. +// +// +// Returns Has same shape as data, except for the first `segment_ids.rank` +// dimensions, which are replaced with a single dimension which has size +// `num_segments`. +func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "UnsortedSegmentSum", + Input: []tf.Input{ + data, segment_ids, num_segments, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Split a `SparseTensor` into `num_split` tensors along one dimension. +// +// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices +// `[0 : shape[split_dim] % num_split]` gets one extra dimension. +// For example, if `split_dim = 1` and `num_split = 2` and the input is +// +// input_tensor = shape = [2, 7] +// [ a d e ] +// [b c ] +// +// Graphically the output tensors are: +// +// output_tensor[0] = shape = [2, 4] +// [ a ] +// [b c ] +// +// output_tensor[1] = shape = [2, 3] +// [ d e ] +// [ ] +// +// Arguments: +// split_dim: 0-D. The dimension along which to split. Must be in the range +// `[0, rank(shape))`. +// indices: 2-D tensor represents the indices of the sparse tensor. +// values: 1-D tensor represents the values of the sparse tensor. +// shape: 1-D. tensor represents the shape of the sparse tensor. +// output indices: A list of 1-D tensors represents the indices of the output +// sparse tensors. +// num_split: The number of ways to split. +// +// Returns A list of 1-D tensors represents the values of the output sparse +// tensors.A list of 1-D tensors represents the shape of the output sparse +// tensors. +func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_split": num_split} + opspec := tf.OpSpec{ + Type: "SparseSplit", + Input: []tf.Input{ + split_dim, indices, values, shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil { + scope.UpdateErr("SparseSplit", err) + return + } + if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil { + scope.UpdateErr("SparseSplit", err) + return + } + if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil { + scope.UpdateErr("SparseSplit", err) + return + } + return output_indices, output_values, output_shape +} + +// UnicodeDecodeAttr is an optional argument to UnicodeDecode. +type UnicodeDecodeAttr func(optionalAttr) + +// UnicodeDecodeErrors sets the optional errors attribute to value. +// +// value: Error handling policy when there is invalid formatting found in the input. 
+// The value of 'strict' will cause the operation to produce a InvalidArgument +// error on any invalid input formatting. A value of 'replace' (the default) will +// cause the operation to replace any invalid formatting in the input with the +// `replacement_char` codepoint. A value of 'ignore' will cause the operation to +// skip any invalid formatting in the input and produce no corresponding output +// character. +// If not specified, defaults to "replace" +func UnicodeDecodeErrors(value string) UnicodeDecodeAttr { + return func(m optionalAttr) { + m["errors"] = value + } +} + +// UnicodeDecodeReplacementChar sets the optional replacement_char attribute to value. +// +// value: The replacement character codepoint to be used in place of any invalid +// formatting in the input when `errors='replace'`. Any valid unicode codepoint may +// be used. The default value is the default unicode replacement character is +// 0xFFFD or U+65533.) +// If not specified, defaults to 65533 +func UnicodeDecodeReplacementChar(value int64) UnicodeDecodeAttr { + return func(m optionalAttr) { + m["replacement_char"] = value + } +} + +// UnicodeDecodeReplaceControlCharacters sets the optional replace_control_characters attribute to value. +// +// value: Whether to replace the C0 control characters (00-1F) with the +// `replacement_char`. Default is false. +// If not specified, defaults to false +func UnicodeDecodeReplaceControlCharacters(value bool) UnicodeDecodeAttr { + return func(m optionalAttr) { + m["replace_control_characters"] = value + } +} + +// UnicodeDecodeTsplits sets the optional Tsplits attribute to value. +// If not specified, defaults to DT_INT64 +func UnicodeDecodeTsplits(value tf.DataType) UnicodeDecodeAttr { + return func(m optionalAttr) { + m["Tsplits"] = value + } +} + +// Decodes each string in `input` into a sequence of Unicode code points. +// +// The character codepoints for all strings are returned using a single vector +// `char_values`, with strings expanded to characters in row-major order. +// +// The `row_splits` tensor indicates where the codepoints for +// each input string begin and end within the `char_values` tensor. +// In particular, the values for the `i`th +// string (in row-major order) are stored in the slice +// `[row_splits[i]:row_splits[i+1]]`. Thus: +// +// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th +// character in the `i`th string (in row-major order). +// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th +// string (in row-major order). +// +// Arguments: +// input: The text to be decoded. Can have any shape. Note that the output is flattened +// to a vector of char values. +// input_encoding: Text encoding of the input strings. This is any of the encodings supported +// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`. +// +// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints. 
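A similar sketch for the `UnicodeDecode` wrapper that follows; the row splits delimit each input string's slice of the flat codepoint vector (output dtypes follow the `Tsplits` attribute, `int64` by default):

```go
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	strs := op.Const(s, []string{"héllo", "wörld"})
	// Decode both strings into one flat vector of codepoints plus row splits.
	rowSplits, charValues := op.UnicodeDecode(s, strs, "UTF-8",
		op.UnicodeDecodeErrors("replace"),
		op.UnicodeDecodeReplaceControlCharacters(false))

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	out, err := sess.Run(nil, []tf.Output{rowSplits, charValues}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value()) // row splits, e.g. [0 5 10]
	fmt.Println(out[1].Value()) // codepoints of both strings, row-major
}
```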
+func UnicodeDecode(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeAttr) (row_splits tf.Output, char_values tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"input_encoding": input_encoding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "UnicodeDecode", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2. +type ResourceApplyFtrlV2Attr func(optionalAttr) + +// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var and accum tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' according to the Ftrl-proximal scheme. +// +// grad_with_shrinkage = grad + 2 * l2_shrinkage * var +// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage +// linear += grad_with_shrinkage + +// (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +// accum = accum_new +// +// Arguments: +// var_: Should be from a Variable(). +// accum: Should be from a Variable(). +// linear: Should be from a Variable(). +// grad: The gradient. +// lr: Scaling factor. Must be a scalar. +// l1: L1 regulariation. Must be a scalar. +// l2: L2 shrinkage regulariation. Must be a scalar. +// +// lr_power: Scaling factor. Must be a scalar. +// +// Returns the created operation. +func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceApplyFtrlV2", + Input: []tf.Input{ + var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize. +type OrderedMapIncompleteSizeAttr func(optionalAttr) + +// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr { + return func(m optionalAttr) { + m["memory_limit"] = value + } +} + +// OrderedMapIncompleteSizeContainer sets the optional container attribute to value. 
+// If not specified, defaults to "" +func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Op returns the number of incomplete elements in the underlying container. +func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtypes": dtypes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "OrderedMapIncompleteSize", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is +// +// true, this follows Python semantics in that the result here is consistent +// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`. +// +// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "FloorMod", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// VarHandleOpAttr is an optional argument to VarHandleOp. +type VarHandleOpAttr func(optionalAttr) + +// VarHandleOpContainer sets the optional container attribute to value. +// +// value: the container this variable is placed in. +// If not specified, defaults to "" +func VarHandleOpContainer(value string) VarHandleOpAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// VarHandleOpSharedName sets the optional shared_name attribute to value. +// +// value: the name by which this variable is referred to. +// If not specified, defaults to "" +func VarHandleOpSharedName(value string) VarHandleOpAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Creates a handle to a Variable resource. +// +// Arguments: +// dtype: the type of this variable. Must agree with the dtypes +// of all ops using this variable. +// shape: The (possibly partially specified) shape of this variable. +func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype, "shape": shape} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "VarHandleOp", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// NthElementAttr is an optional argument to NthElement. +type NthElementAttr func(optionalAttr) + +// NthElementReverse sets the optional reverse attribute to value. +// +// value: When set to True, find the nth-largest value in the vector and vice +// versa. +// If not specified, defaults to false +func NthElementReverse(value bool) NthElementAttr { + return func(m optionalAttr) { + m["reverse"] = value + } +} + +// Finds values of the `n`-th order statistic for the last dimension. 
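Combining `VarHandleOp` above with the `ResourceApplyGradientDescent` wrapper defined earlier in this file gives a complete resource-variable update. A minimal sketch of one descent step on a scalar variable; it assumes the `AssignVariableOp` and `ReadVariableOp` wrappers generated elsewhere in this package and `tf.ScalarShape` from the core Go binding:

```go
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// Scalar float32 resource variable.
	w := op.VarHandleOp(s, tf.Float, tf.ScalarShape(), op.VarHandleOpSharedName("w"))
	initW := op.AssignVariableOp(s, w, op.Const(s, float32(3)))
	// w <- w - alpha * delta
	step := op.ResourceApplyGradientDescent(s, w,
		op.Const(s, float32(0.1)), op.Const(s, float32(1)))
	readW := op.ReadVariableOp(s, w, tf.Float)

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	// Initialize, take one step, then read the result: 3.0 - 0.1*1.0 = 2.9.
	if _, err := sess.Run(nil, nil, []*tf.Operation{initW}); err != nil {
		panic(err)
	}
	if _, err := sess.Run(nil, nil, []*tf.Operation{step}); err != nil {
		panic(err)
	}
	out, err := sess.Run(nil, []tf.Output{readW}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value()) // 2.9
}
```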
+// +// If the input is a vector (rank-1), finds the entries which is the nth-smallest +// value in the vector and outputs their values as scalar tensor. +// +// For matrices (resp. higher rank input), computes the entries which is the +// nth-smallest value in each row (resp. vector along the last dimension). Thus, +// +// values.shape = input.shape[:-1] +// +// Arguments: +// input: 1-D or higher with last dimension at least `n+1`. +// n: 0-D. Position of sorted vector to select along the last dimension (along +// each row for matrices). Valid range of n is `[0, input.shape[:-1])` +// +// Returns The `n`-th order statistic along each last dimensional slice. +func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "NthElement", + Input: []tf.Input{ + input, n, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// 2D fast Fourier transform. +// +// Computes the 2-dimensional discrete Fourier transform over the inner-most +// 2 dimensions of `input`. +// +// Arguments: +// input: A complex tensor. +// +// Returns A complex tensor of the same shape as `input`. The inner-most 2 +// dimensions of `input` are replaced with their 2D Fourier transform. +// +// @compatibility(numpy) +// Equivalent to np.fft.fft2 +// @end_compatibility +func FFT2D(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "FFT2D", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes inverse hyperbolic cosine of x element-wise. +func Acosh(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Acosh", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the gradient for the inverse of `x` wrt its input. +// +// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy` +// is the corresponding input gradient. +func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "InvGrad", + Input: []tf.Input{ + y, dy, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2. +type MutableHashTableOfTensorsV2Attr func(optionalAttr) + +// MutableHashTableOfTensorsV2Container sets the optional container attribute to value. +// +// value: If non-empty, this table is placed in the given container. +// Otherwise, a default container is used. +// If not specified, defaults to "" +func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this table is shared under the given name across +// multiple sessions. +// If not specified, defaults to "" +func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value. 
+// If not specified, defaults to false +func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr { + return func(m optionalAttr) { + m["use_node_name_sharing"] = value + } +} + +// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value. +// If not specified, defaults to <> +func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr { + return func(m optionalAttr) { + m["value_shape"] = value + } +} + +// Creates an empty hash table. +// +// This op creates a mutable hash table, specifying the type of its keys and +// values. Each value must be a vector. Data can be inserted into the table using +// the insert operations. It does not support the initialization operation. +// +// Arguments: +// key_dtype: Type of the table keys. +// value_dtype: Type of the table values. +// +// Returns Handle to a table. +func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MutableHashTableOfTensorsV2", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// TextLineReaderV2Attr is an optional argument to TextLineReaderV2. +type TextLineReaderV2Attr func(optionalAttr) + +// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value. +// +// value: Number of lines to skip from the beginning of every file. +// If not specified, defaults to 0 +func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr { + return func(m optionalAttr) { + m["skip_header_lines"] = value + } +} + +// TextLineReaderV2Container sets the optional container attribute to value. +// +// value: If non-empty, this reader is placed in the given container. +// Otherwise, a default container is used. +// If not specified, defaults to "" +func TextLineReaderV2Container(value string) TextLineReaderV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// TextLineReaderV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this reader is named in the given bucket +// with this shared_name. Otherwise, the node name is used instead. +// If not specified, defaults to "" +func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// A Reader that outputs the lines of a file delimited by '\n'. +// +// Returns The handle to reference the Reader. +func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TextLineReaderV2", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Records the latency of producing `input_dataset` elements in a StatsAggregator. 
+func ExperimentalLatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "ExperimentalLatencyStatsDataset", + Input: []tf.Input{ + input_dataset, tag, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2. +type FusedBatchNormV2Attr func(optionalAttr) + +// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value. +// +// value: A small float number added to the variance of x. +// If not specified, defaults to 0.0001 +func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr { + return func(m optionalAttr) { + m["epsilon"] = value + } +} + +// FusedBatchNormV2DataFormat sets the optional data_format attribute to value. +// +// value: The data format for x and y. Either "NHWC" (default) or "NCHW". +// If not specified, defaults to "NHWC" +func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// FusedBatchNormV2IsTraining sets the optional is_training attribute to value. +// +// value: A bool value to indicate the operation is for training (default) +// or inference. +// If not specified, defaults to true +func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr { + return func(m optionalAttr) { + m["is_training"] = value + } +} + +// Batch normalization. +// +// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". +// The size of 1D Tensors matches the dimension C of the 4D Tensors. +// +// Arguments: +// x: A 4D Tensor for input data. +// scale: A 1D Tensor for scaling factor, to scale the normalized x. +// offset: A 1D Tensor for offset, to shift to the normalized x. +// mean: A 1D Tensor for population mean. Used for inference only; +// must be empty for training. +// variance: A 1D Tensor for population variance. Used for inference only; +// must be empty for training. +// +// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow +// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by +// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused +// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance +// in the cuDNN case), to be reused in the gradient computation. +func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "FusedBatchNormV2", + Input: []tf.Input{ + x, scale, offset, mean, variance, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4) +} + +// Debugging/model interpretability outputs for each example. 
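For `FusedBatchNormV2` above, inference mode requires non-empty population statistics and `is_training` set to false. A graph-construction sketch, with a channel count of 1 chosen for brevity; the remaining four outputs are only meaningful during training and are discarded here:

```go
package example

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

// buildInferenceBatchNorm wires FusedBatchNormV2 in inference mode for an
// NHWC input x with a single channel, using fixed population statistics.
func buildInferenceBatchNorm(s *op.Scope, x tf.Output) tf.Output {
	scale := op.Const(s, []float32{1})
	offset := op.Const(s, []float32{0})
	mean := op.Const(s, []float32{0.5})
	variance := op.Const(s, []float32{0.25})
	y, _, _, _, _ := op.FusedBatchNormV2(s, x, scale, offset, mean, variance,
		op.FusedBatchNormV2IsTraining(false),
		op.FusedBatchNormV2DataFormat("NHWC"))
	return y
}
```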
+// +// It traverses all the trees and computes debug metrics for individual examples, +// such as getting split feature ids and logits after each split along the decision +// path used to compute directional feature contributions. +// +// Arguments: +// +// bucketized_features: A list of rank 1 Tensors containing bucket id for each +// feature. +// logits_dimension: scalar, dimension of the logits, to be used for constructing the protos in +// examples_debug_outputs_serialized. +// +// Returns Output rank 1 Tensor containing a proto serialized as a string for each example. +func BoostedTreesExampleDebugOutputs(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (examples_debug_outputs_serialized tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"logits_dimension": logits_dimension} + opspec := tf.OpSpec{ + Type: "BoostedTreesExampleDebugOutputs", + Input: []tf.Input{ + tree_ensemble_handle, tf.OutputList(bucketized_features), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RetrieveTPUEmbeddingADAMParametersAttr is an optional argument to RetrieveTPUEmbeddingADAMParameters. +type RetrieveTPUEmbeddingADAMParametersAttr func(optionalAttr) + +// RetrieveTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value. // If not specified, defaults to -1 // // REQUIRES: value >= -1 -func LoadTPUEmbeddingProximalAdagradParametersTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersAttr { +func RetrieveTPUEmbeddingADAMParametersTableId(value int64) RetrieveTPUEmbeddingADAMParametersAttr { return func(m optionalAttr) { m["table_id"] = value } } -// LoadTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value. +// RetrieveTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value. // If not specified, defaults to "" -func LoadTPUEmbeddingProximalAdagradParametersTableName(value string) LoadTPUEmbeddingProximalAdagradParametersAttr { +func RetrieveTPUEmbeddingADAMParametersTableName(value string) RetrieveTPUEmbeddingADAMParametersAttr { return func(m optionalAttr) { m["table_name"] = value } } -// Load proximal Adagrad embedding parameters. +// Retrieve ADAM embedding parameters. +// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the ADAM optimization algorithm.Parameter momenta updated by the ADAM optimization algorithm.Parameter velocities updated by the ADAM optimization algorithm. +func RetrieveTPUEmbeddingADAMParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingADAMParameters", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Computes the sum along segments of a tensor. 
+// +// Read +// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) +// for an explanation of segments. +// +// Computes a tensor such that +// \\(output_i = \sum_j data_j\\) where sum is over `j` such +// that `segment_ids[j] == i`. +// +// If the sum is empty for a given segment ID `i`, `output[i] = 0`. +// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt> +// </div> +// +// For example: +// +// ``` +// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]]) +// tf.segment_sum(c, tf.constant([0, 0, 1])) +// # ==> [[5, 5, 5, 5], +// # [5, 6, 7, 8]] +// ``` +// +// +// Arguments: +// +// segment_ids: A 1-D tensor whose size is equal to the size of `data`'s +// first dimension. Values should be sorted and can be repeated. +// +// Returns Has same shape as data, except for dimension 0 which +// has size `k`, the number of segments. +func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SegmentSum", + Input: []tf.Input{ + data, segment_ids, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a dataset containing elements of first component of `input_dataset` having true in the last component. +func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "FilterByLastComponentDataset", + Input: []tf.Input{ + input_dataset, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// An op enabling differentiation of TPU Embeddings. +// +// This op simply returns its first input, which is assumed to have been sliced +// from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of +// this op, and its first argument being a trainable Variable, enables automatic +// differentiation of graphs containing embeddings via the TPU Embedding Python +// libraries. +// +// Arguments: +// embedding_variable: A trainable variable, enabling optimizers to find this op. +// sliced_activations: The embedding activations Tensor to return. +// table_id: The id of the table in the embedding layer configuration from which +// these activations were computed. +// lookup_id: Identifier of the set of embedding indices which produced these +// activations. +func TPUEmbeddingActivations(scope *Scope, embedding_variable tf.Output, sliced_activations tf.Output, table_id int64, lookup_id int64) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"table_id": table_id, "lookup_id": lookup_id} + opspec := tf.OpSpec{ + Type: "TPUEmbeddingActivations", + Input: []tf.Input{ + embedding_variable, sliced_activations, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug. +type RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr) + +// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId sets the optional table_id attribute to value. 
+// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Retrieve RMSProp embedding parameters with debug support. +// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the RMSProp optimization algorithm.Parameter ms updated by the RMSProp optimization algorithm.Parameter mom updated by the RMSProp optimization algorithm.Parameter gradient_accumulators updated by the RMSProp optimization algorithm. +func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3) +} + +// RetrieveTPUEmbeddingAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingAdagradParameters. +type RetrieveTPUEmbeddingAdagradParametersAttr func(optionalAttr) + +// RetrieveTPUEmbeddingAdagradParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingAdagradParametersTableId(value int64) RetrieveTPUEmbeddingAdagradParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// RetrieveTPUEmbeddingAdagradParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func RetrieveTPUEmbeddingAdagradParametersTableName(value string) RetrieveTPUEmbeddingAdagradParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Retrieve Adagrad embedding parameters. +// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the Adagrad optimization algorithm.Parameter accumulators updated by the Adagrad optimization algorithm. 
+func RetrieveTPUEmbeddingAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingAdagradParameters", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// Returns 0 if the denominator is zero. +// +// +// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "DivNoNan", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax. +type MaxPoolWithArgmaxAttr func(optionalAttr) + +// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value. +// If not specified, defaults to DT_INT64 +func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr { + return func(m optionalAttr) { + m["Targmax"] = value + } +} + +// MaxPoolWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value. +// +// value: Whether to include batch dimension in flattened index of `argmax`. +// If not specified, defaults to false +func MaxPoolWithArgmaxIncludeBatchInIndex(value bool) MaxPoolWithArgmaxAttr { + return func(m optionalAttr) { + m["include_batch_in_index"] = value + } +} + +// Performs max pooling on the input and outputs both max values and indices. +// +// The indices in `argmax` are flattened, so that a maximum value at position +// `[b, y, x, c]` becomes flattened index: +// `(y * width + x) * channels + c` if `include_batch_in_index` is False; +// `((b * height + y) * width + x) * channels + c` if `include_batch_in_index` is True. +// +// The indices returned are always in `[0, height) x [0, width)` before flattening, +// even if padding is involved and the mathematically correct answer is outside +// (either negative or too large). This is a bug, but fixing it is difficult to do +// in a safe backwards compatible way, especially due to flattening. +// +// Arguments: +// input: 4-D with shape `[batch, height, width, channels]`. Input to pool over. +// ksize: The size of the window for each dimension of the input tensor. +// strides: The stride of the sliding window for each dimension of the +// input tensor. +// padding: The type of padding algorithm to use. +// +// Returns The max pooled output tensor.4-D. The flattened indices of the max values chosen for each output. 
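A graph-construction sketch for the `MaxPoolWithArgmax` wrapper that follows: 2x2 windows with stride 2, with the batch dimension folded into the returned argmax indices. Running it uses the same Finalize/NewSession/Run pattern as the earlier sketches, feeding a `[1, 4, 4, 1]` float32 tensor for the placeholder:

```go
package example

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

// buildMaxPoolWithArgmax returns the placeholder to feed plus the pooled
// values and the flattened indices of the maxima.
func buildMaxPoolWithArgmax(s *op.Scope) (img, pooled, argmax tf.Output) {
	img = op.Placeholder(s, tf.Float, op.PlaceholderShape(tf.MakeShape(1, 4, 4, 1)))
	pooled, argmax = op.MaxPoolWithArgmax(s, img,
		[]int64{1, 2, 2, 1}, // ksize: 2x2 window over height and width
		[]int64{1, 2, 2, 1}, // strides: step 2 in height and width
		"VALID",
		op.MaxPoolWithArgmaxIncludeBatchInIndex(true))
	return img, pooled, argmax
}
```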
+func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MaxPoolWithArgmax", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// Convert the quantized 'input' tensor into a lower-precision 'output', using the +// +// actual distribution of the values to maximize the usage of the lower bit depth +// and adjusting the output min and max ranges accordingly. +// +// [input_min, input_max] are scalar floats that specify the range for the float +// interpretation of the 'input' data. For example, if input_min is -1.0f and +// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0 +// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f. +// +// This operator tries to squeeze as much precision as possible into an output with +// a lower bit depth by calculating the actual min and max values found in the +// data. For example, maybe that quint16 input has no values lower than 16,384 and +// none higher than 49,152. That means only half the range is actually needed, all +// the float interpretations are between -0.5f and 0.5f, so if we want to compress +// the data into a quint8 output, we can use that range rather than the theoretical +// -1.0f to 1.0f that is suggested by the input min and max. +// +// In practice, this is most useful for taking output from operations like +// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and +// may have large potential output ranges, but in practice have a distribution of +// input values that only uses a small fraction of the possible range. By feeding +// that output into this operator, we can reduce it from 32 bits down to 8 with +// minimal loss of accuracy. +// +// Arguments: +// +// input_min: The float value that the minimum quantized input value represents. +// input_max: The float value that the maximum quantized input value represents. +// out_type: The type of the output. Should be a lower bit depth than Tinput. +// +// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents. +func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"out_type": out_type} + opspec := tf.OpSpec{ + Type: "QuantizeDownAndShrinkRange", + Input: []tf.Input{ + input, input_min, input_max, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// ImageSummaryAttr is an optional argument to ImageSummary. +type ImageSummaryAttr func(optionalAttr) + +// ImageSummaryMaxImages sets the optional max_images attribute to value. +// +// value: Max number of batch elements to generate images for. +// If not specified, defaults to 3 +// +// REQUIRES: value >= 1 +func ImageSummaryMaxImages(value int64) ImageSummaryAttr { + return func(m optionalAttr) { + m["max_images"] = value + } +} + +// ImageSummaryBadColor sets the optional bad_color attribute to value. 
+// +// value: Color to use for pixels with non-finite values. +// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 > +func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { + return func(m optionalAttr) { + m["bad_color"] = value + } +} + +// Outputs a `Summary` protocol buffer with images. +// +// The summary has up to `max_images` summary values containing images. The +// images are built from `tensor` which must be 4-D with shape `[batch_size, +// height, width, channels]` and where `channels` can be: +// +// * 1: `tensor` is interpreted as Grayscale. +// * 3: `tensor` is interpreted as RGB. +// * 4: `tensor` is interpreted as RGBA. +// +// The images have the same number of channels as the input tensor. For float +// input, the values are normalized one image at a time to fit in the range +// `[0, 255]`. `uint8` values are unchanged. The op uses two different +// normalization algorithms: +// +// * If the input values are all positive, they are rescaled so the largest one +// is 255. +// +// * If any input value is negative, the values are shifted so input value 0.0 +// is at 127. They are then rescaled so that either the smallest value is 0, +// or the largest one is 255. +// +// The `tag` argument is a scalar `Tensor` of type `string`. It is used to +// build the `tag` of the summary values: +// +// * If `max_images` is 1, the summary value tag is '*tag*/image'. +// * If `max_images` is greater than 1, the summary value tags are +// generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. +// +// The `bad_color` argument is the color to use in the generated images for +// non-finite input values. It is a `uint8` 1-D tensor of length `channels`. +// Each element must be in the range `[0, 255]` (It represents the value of a +// pixel in the output image). Non-finite values in the input tensor are +// replaced by this tensor in the output image. The default value is the color +// red. +// +// Arguments: +// tag: Scalar. Used to build the `tag` attribute of the summary values. +// tensor: 4-D of shape `[batch_size, height, width, channels]` where +// `channels` is 1, 3, or 4. +// +// Returns Scalar. Serialized `Summary` protocol buffer. +func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ImageSummary", + Input: []tf.Input{ + tag, tensor, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// BiasAddGradAttr is an optional argument to BiasAddGrad. +type BiasAddGradAttr func(optionalAttr) + +// BiasAddGradDataFormat sets the optional data_format attribute to value. +// +// value: Specify the data format of the input and output data. With the +// default format "NHWC", the bias tensor will be added to the last dimension +// of the value tensor. +// Alternatively, the format could be "NCHW", the data storage order of: +// [batch, in_channels, in_height, in_width]. +// The tensor will be added to "in_channels", the third-to-the-last +// dimension. +// If not specified, defaults to "NHWC" +func BiasAddGradDataFormat(value string) BiasAddGradAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// The backward operation for "BiasAdd" on the "bias" tensor. +// +// It accumulates all the values from out_backprop into the feature dimension. 
+// For NHWC data format, the feature dimension is the last. For NCHW data format, +// the feature dimension is the third-to-last. +// +// Arguments: +// out_backprop: Any number of dimensions. +// +// Returns 1-D with size the feature dimension of `out_backprop`. +func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "BiasAddGrad", + Input: []tf.Input{ + out_backprop, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Scatter the data from the input value into specific TensorArray elements. +// +// `indices` must be a vector, its length must match the first dim of `value`. +// +// Arguments: +// handle: The handle to a TensorArray. +// indices: The locations at which to write the tensor elements. +// value: The concatenated tensor to write to the TensorArray. +// flow_in: A float scalar that enforces proper chaining of operations. +// +// Returns A float scalar that enforces proper chaining of operations. +func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorArrayScatterV3", + Input: []tf.Input{ + handle, indices, value, flow_in, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the number of elements in the given table. +// +// Arguments: +// table_handle: Handle to the table. +// +// Returns Scalar that contains number of elements in the table. +func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "LookupTableSizeV2", + Input: []tf.Input{ + table_handle, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes softmax activations. +// +// For each batch `i` and class `j` we have +// +// $$softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))$$ +// +// Arguments: +// logits: 2-D with shape `[batch_size, num_classes]`. +// +// Returns Same shape as `logits`. +func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Softmax", + Input: []tf.Input{ + logits, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// LoadTPUEmbeddingADAMParametersAttr is an optional argument to LoadTPUEmbeddingADAMParameters. +type LoadTPUEmbeddingADAMParametersAttr func(optionalAttr) + +// LoadTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func LoadTPUEmbeddingADAMParametersTableId(value int64) LoadTPUEmbeddingADAMParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// LoadTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func LoadTPUEmbeddingADAMParametersTableName(value string) LoadTPUEmbeddingADAMParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Load ADAM embedding parameters. +// +// An op that loads optimization parameters into HBM for embedding. Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. 
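For the `Softmax` wrapper above, each row of logits is exponentiated and normalized to sum to one; a minimal runnable sketch with a single row of logits:

```go
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	logits := op.Const(s, [][]float32{{1, 2, 3}})
	probs := op.Softmax(s, logits)

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	out, err := sess.Run(nil, []tf.Output{probs}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value()) // approximately [[0.090 0.245 0.665]]
}
```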
For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. +// +// Arguments: +// parameters: Value of parameters used in the ADAM optimization algorithm. +// momenta: Value of momenta used in the ADAM optimization algorithm. +// velocities: Value of velocities used in the ADAM optimization algorithm. +// +// +// +// Returns the created operation. +func LoadTPUEmbeddingADAMParameters(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LoadTPUEmbeddingADAMParameters", + Input: []tf.Input{ + parameters, momenta, velocities, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Interleave the values from the `data` tensors into a single tensor. +// +// Builds a merged tensor such that +// +// ```python +// merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...] +// ``` +// +// For example, if each `indices[m]` is scalar or vector, we have +// +// ```python +// # Scalar indices: +// merged[indices[m], ...] = data[m][...] +// +// # Vector indices: +// merged[indices[m][i], ...] = data[m][i, ...] +// ``` +// +// Each `data[i].shape` must start with the corresponding `indices[i].shape`, +// and the rest of `data[i].shape` must be constant w.r.t. `i`. That is, we +// must have `data[i].shape = indices[i].shape + constant`. In terms of this +// `constant`, the output shape is +// +// merged.shape = [max(indices)] + constant +// +// Values may be merged in parallel, so if an index appears in both `indices[m][i]` +// and `indices[n][j]`, the result may be invalid. This differs from the normal +// DynamicStitch operator that defines the behavior in that case. +// +// For example: +// +// ```python +// indices[0] = 6 +// indices[1] = [4, 1] +// indices[2] = [[5, 2], [0, 3]] +// data[0] = [61, 62] +// data[1] = [[41, 42], [11, 12]] +// data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]] +// merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42], +// [51, 52], [61, 62]] +// ``` +// +// This method can be used to merge partitions created by `dynamic_partition` +// as illustrated on the following example: +// +// ```python +// # Apply function (increments x_i) on elements for which a certain condition +// # apply (x_i != -1 in this example). +// x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4]) +// condition_mask=tf.not_equal(x,tf.constant(-1.)) +// partitioned_data = tf.dynamic_partition( +// x, tf.cast(condition_mask, tf.int32) , 2) +// partitioned_data[1] = partitioned_data[1] + 1.0 +// condition_indices = tf.dynamic_partition( +// tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2) +// x = tf.dynamic_stitch(condition_indices, partitioned_data) +// # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain +// # unchanged. 
+// ``` +// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt> +// </div> +func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ParallelDynamicStitch", + Input: []tf.Input{ + tf.OutputList(indices), tf.OutputList(data), + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug. +type LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr) + +// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Load proximal Adagrad embedding parameters with debug support. // // An op that loads optimization parameters into HBM for embedding. Must be // preceded by a ConfigureTPUEmbeddingHost op that sets up the correct @@ -17529,11 +23179,12 @@ func LoadTPUEmbeddingProximalAdagradParametersTableName(value string) LoadTPUEmb // Arguments: // parameters: Value of parameters used in the proximal Adagrad optimization algorithm. // accumulators: Value of accumulators used in the proximal Adagrad optimization algorithm. +// gradient_accumulators: Value of gradient_accumulators used in the proximal Adagrad optimization algorithm. // // // // Returns the created operation. -func LoadTPUEmbeddingProximalAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersAttr) (o *tf.Operation) { +func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (o *tf.Operation) { if scope.Err() != nil { return } @@ -17542,107 +23193,96 @@ func LoadTPUEmbeddingProximalAdagradParameters(scope *Scope, parameters tf.Outpu a(attrs) } opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingProximalAdagradParameters", + Type: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug", Input: []tf.Input{ - parameters, accumulators, + parameters, accumulators, gradient_accumulators, }, Attrs: attrs, } return scope.AddOperation(opspec) } -// Returns which elements of x are Inf. +// Store the input tensor in the state of the current session. // -// @compatibility(numpy) -// Equivalent to np.isinf -// @end_compatibility -func IsInf(scope *Scope, x tf.Output) (y tf.Output) { +// Arguments: +// value: The tensor to be stored. +// +// Returns The handle for the tensor stored in the session state, represented +// as a ResourceHandle object. 
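+//
+// Illustrative, hand-written usage sketch (not generated): it assumes an
+// existing scope `s` and a tensor output `v`, and that the companion
+// `GetSessionTensor` wrapper (defined elsewhere in this package) is used to
+// read the stored value back in a later Session.Run call:
+//
+// ```
+// handle := GetSessionHandleV2(s, v)
+// restored := GetSessionTensor(s, handle, tf.Float)
+// ```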
+func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "IsInf", + Type: "GetSessionHandleV2", Input: []tf.Input{ - x, + value, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// Deprecated, use python implementation tf.linalg.matrix_exponential. +// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd. +type ResourceScatterNdAddAttr func(optionalAttr) + +// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value. // -// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead. -func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "MatrixExponential", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Does nothing. Only useful as a placeholder for control edges. -// -// Returns the created operation. -func NoOp(scope *Scope) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "NoOp", - } - return scope.AddOperation(opspec) -} - -// Computes softplus: `log(exp(features) + 1)`. -func Softplus(scope *Scope, features tf.Output) (activations tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Softplus", - Input: []tf.Input{ - features, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent. -type ResourceApplyProximalGradientDescentAttr func(optionalAttr) - -// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value. -// -// value: If True, the subtraction will be protected by a lock; -// otherwise the behavior is undefined, but may exhibit less contention. -// If not specified, defaults to false -func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr { +// value: An optional bool. Defaults to True. If True, the assignment will +// be protected by a lock; otherwise the behavior is undefined, +// but may exhibit less contention. +// If not specified, defaults to true +func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr { return func(m optionalAttr) { m["use_locking"] = value } } -// Update '*var' as FOBOS algorithm with fixed learning rate. +// Applies sparse addition to individual values or slices in a Variable. // -// prox_v = var - alpha * delta -// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} +// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. +// +// `indices` must be integer tensor, containing indices into `ref`. +// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`. +// +// The innermost dimension of `indices` (with length `K`) corresponds to +// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th +// dimension of `ref`. +// +// `updates` is `Tensor` of rank `Q-1+P-K` with shape: +// +// ``` +// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]] +// ``` +// +// For example, say we want to add 4 scattered elements to a rank-1 tensor to +// 8 elements. 
In Python, that addition would look like this: +// +// ```python +// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True) +// indices = tf.constant([[4], [3], [1], [7]]) +// updates = tf.constant([9, 10, 11, 12]) +// add = tf.scatter_nd_add(ref, indices, updates) +// with tf.Session() as sess: +// print sess.run(add) +// ``` +// +// The resulting update to ref would look like this: +// +// [1, 13, 3, 14, 14, 6, 7, 20] +// +// See `tf.scatter_nd` for more details about how to make updates to +// slices. // // Arguments: -// var_: Should be from a Variable(). -// alpha: Scaling factor. Must be a scalar. -// l1: L1 regularization. Must be a scalar. -// l2: L2 regularization. Must be a scalar. -// delta: The change. +// ref: A resource handle. Must be from a VarHandleOp. +// indices: A Tensor. Must be one of the following types: int32, int64. +// A tensor of indices into ref. +// updates: A Tensor. Must have the same type as ref. A tensor of +// values to add to ref. // // Returns the created operation. -func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) { +func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) { if scope.Err() != nil { return } @@ -17651,9 +23291,435 @@ func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf a(attrs) } opspec := tf.OpSpec{ - Type: "ResourceApplyProximalGradientDescent", + Type: "ResourceScatterNdAdd", Input: []tf.Input{ - var_, alpha, l1, l2, delta, + ref, indices, updates, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign. +type ResourceApplyPowerSignAttr func(optionalAttr) + +// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var and m tensors is +// protected by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' according to the AddSign update. +// +// m_t <- beta1 * m_{t-1} + (1 - beta1) * g +// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g +// variable <- variable - lr_t * update +// +// Arguments: +// var_: Should be from a Variable(). +// m: Should be from a Variable(). +// lr: Scaling factor. Must be a scalar. +// logbase: Must be a scalar. +// sign_decay: Must be a scalar. +// beta: Must be a scalar. +// grad: The gradient. +// +// Returns the created operation. +func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceApplyPowerSign", + Input: []tf.Input{ + var_, m, lr, logbase, sign_decay, beta, grad, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform. 
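+//
+// Like the other *Attr types in this package, it follows a functional-option
+// pattern: each setter returns a closure that fills one entry of the op's
+// attribute map. A hand-written sketch (assuming an existing scope `s` and
+// int32 outputs `shape` and `seed`) of overriding the output dtype:
+//
+// ```
+// out := StatelessRandomUniform(s, shape, seed, StatelessRandomUniformDtype(tf.Double))
+// ```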
+type StatelessRandomUniformAttr func(optionalAttr) + +// StatelessRandomUniformDtype sets the optional dtype attribute to value. +// +// value: The type of the output. +// If not specified, defaults to DT_FLOAT +func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr { + return func(m optionalAttr) { + m["dtype"] = value + } +} + +// Outputs deterministic pseudorandom random values from a uniform distribution. +// +// The generated values follow a uniform distribution in the range `[0, 1)`. The +// lower bound 0 is included in the range, while the upper bound 1 is excluded. +// +// The outputs are a deterministic function of `shape` and `seed`. +// +// Arguments: +// shape: The shape of the output tensor. +// seed: 2 seeds (shape [2]). +// +// Returns Random values with specified shape. +func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StatelessRandomUniform", + Input: []tf.Input{ + shape, seed, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Checks whether a resource handle-based variable has been initialized. +// +// Arguments: +// resource: the input resource handle. +// +// Returns a scalar boolean which is true if the variable has been +// initialized. +func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "VarIsInitializedOp", + Input: []tf.Input{ + resource, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Deprecated. Use TensorArrayGradV3 +// +// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3 +func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorArrayWriteV2", + Input: []tf.Input{ + handle, index, value, flow_in, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// This op is used as a placeholder in If branch functions. It doesn't provide a +// valid output when run, so must either be removed (e.g. replaced with a +// function input) or guaranteed not to be used (e.g. if mirroring an +// intermediate output needed for the gradient computation of the other branch). +// +// Arguments: +// dtype: The type of the output. +// shape: The purported shape of the output. This is only used for shape inference; +// the output will not necessarily have this shape. Can be a partial shape. +// +// Returns \"Fake\" output value. This should not be consumed by another op. +func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype, "shape": shape} + opspec := tf.OpSpec{ + Type: "FakeParam", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// UnicodeDecodeWithOffsetsAttr is an optional argument to UnicodeDecodeWithOffsets. +type UnicodeDecodeWithOffsetsAttr func(optionalAttr) + +// UnicodeDecodeWithOffsetsErrors sets the optional errors attribute to value. +// +// value: Error handling policy when there is invalid formatting found in the input. 
+// The value of 'strict' will cause the operation to produce a InvalidArgument +// error on any invalid input formatting. A value of 'replace' (the default) will +// cause the operation to replace any invalid formatting in the input with the +// `replacement_char` codepoint. A value of 'ignore' will cause the operation to +// skip any invalid formatting in the input and produce no corresponding output +// character. +// If not specified, defaults to "replace" +func UnicodeDecodeWithOffsetsErrors(value string) UnicodeDecodeWithOffsetsAttr { + return func(m optionalAttr) { + m["errors"] = value + } +} + +// UnicodeDecodeWithOffsetsReplacementChar sets the optional replacement_char attribute to value. +// +// value: The replacement character codepoint to be used in place of any invalid +// formatting in the input when `errors='replace'`. Any valid unicode codepoint may +// be used. The default value is the default unicode replacement character is +// 0xFFFD or U+65533.) +// If not specified, defaults to 65533 +func UnicodeDecodeWithOffsetsReplacementChar(value int64) UnicodeDecodeWithOffsetsAttr { + return func(m optionalAttr) { + m["replacement_char"] = value + } +} + +// UnicodeDecodeWithOffsetsReplaceControlCharacters sets the optional replace_control_characters attribute to value. +// +// value: Whether to replace the C0 control characters (00-1F) with the +// `replacement_char`. Default is false. +// If not specified, defaults to false +func UnicodeDecodeWithOffsetsReplaceControlCharacters(value bool) UnicodeDecodeWithOffsetsAttr { + return func(m optionalAttr) { + m["replace_control_characters"] = value + } +} + +// UnicodeDecodeWithOffsetsTsplits sets the optional Tsplits attribute to value. +// If not specified, defaults to DT_INT64 +func UnicodeDecodeWithOffsetsTsplits(value tf.DataType) UnicodeDecodeWithOffsetsAttr { + return func(m optionalAttr) { + m["Tsplits"] = value + } +} + +// Decodes each string in `input` into a sequence of Unicode code points. +// +// The character codepoints for all strings are returned using a single vector +// `char_values`, with strings expanded to characters in row-major order. +// Similarly, the character start byte offsets are returned using a single vector +// `char_to_byte_starts`, with strings expanded in row-major order. +// +// The `row_splits` tensor indicates where the codepoints and start offsets for +// each input string begin and end within the `char_values` and +// `char_to_byte_starts` tensors. In particular, the values for the `i`th +// string (in row-major order) are stored in the slice +// `[row_splits[i]:row_splits[i+1]]`. Thus: +// +// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th +// character in the `i`th string (in row-major order). +// * `char_to_bytes_starts[row_splits[i]+j]` is the start byte offset for the `j`th +// character in the `i`th string (in row-major order). +// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th +// string (in row-major order). +// +// Arguments: +// input: The text to be decoded. Can have any shape. Note that the output is flattened +// to a vector of char values. +// input_encoding: Text encoding of the input strings. This is any of the encodings supported +// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`. 
+// +// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.A 1D int32 Tensor containing the byte index in the input string where each +// character in `char_values` starts. +func UnicodeDecodeWithOffsets(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeWithOffsetsAttr) (row_splits tf.Output, char_values tf.Output, char_to_byte_starts tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"input_encoding": input_encoding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "UnicodeDecodeWithOffsets", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Computes inverse hyperbolic tangent of x element-wise. +func Atanh(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Atanh", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2. +type FusedBatchNormGradV2Attr func(optionalAttr) + +// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value. +// +// value: A small float number added to the variance of x. +// If not specified, defaults to 0.0001 +func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr { + return func(m optionalAttr) { + m["epsilon"] = value + } +} + +// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value. +// +// value: The data format for y_backprop, x, x_backprop. +// Either "NHWC" (default) or "NCHW". +// If not specified, defaults to "NHWC" +func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value. +// +// value: A bool value to indicate the operation is for training (default) +// or inference. +// If not specified, defaults to true +func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr { + return func(m optionalAttr) { + m["is_training"] = value + } +} + +// Gradient for batch normalization. +// +// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". +// The size of 1D Tensors matches the dimension C of the 4D Tensors. +// +// Arguments: +// y_backprop: A 4D Tensor for the gradient with respect to y. +// x: A 4D Tensor for input data. +// scale: A 1D Tensor for scaling factor, to scale the normalized x. +// reserve_space_1: When is_training is True, a 1D Tensor for the computed batch +// mean to be reused in gradient computation. When is_training is +// False, a 1D Tensor for the population mean to be reused in both +// 1st and 2nd order gradient computation. +// reserve_space_2: When is_training is True, a 1D Tensor for the computed batch +// variance (inverted variance in the cuDNN case) to be reused in +// gradient computation. When is_training is False, a 1D Tensor +// for the population variance to be reused in both 1st and 2nd +// order gradient computation. +// +// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input +// in FusedBatchNorm. 
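+//
+// Hand-written usage sketch (not generated); `s`, `dy`, `x`, `scale`, `mean`
+// and `variance` are assumed to be pre-existing outputs, with the population
+// statistics passed because `is_training` is set to false:
+//
+// ```
+// dx, dScale, dOffset, _, _ := FusedBatchNormGradV2(s, dy, x, scale, mean, variance,
+// 	FusedBatchNormGradV2DataFormat("NCHW"),
+// 	FusedBatchNormGradV2IsTraining(false))
+// ```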
+func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "FusedBatchNormGradV2", + Input: []tf.Input{ + y_backprop, x, scale, reserve_space_1, reserve_space_2, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4) +} + +// Divides sparse updates into the variable referenced by `resource`. +// +// This operation computes +// +// # Scalar indices +// ref[indices, ...] /= updates[...] +// +// # Vector indices (for each i) +// ref[indices[i], ...] /= updates[i, ...] +// +// # High rank indices (for each i, ..., j) +// ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...] +// +// Duplicate entries are handled correctly: if multiple `indices` reference +// the same location, their contributions multiply. +// +// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. +// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt> +// </div> +// +// Arguments: +// resource: Should be from a `Variable` node. +// indices: A tensor of indices into the first dimension of `ref`. +// updates: A tensor of updated values to add to `ref`. +// +// Returns the created operation. +func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ResourceScatterDiv", + Input: []tf.Input{ + resource, indices, updates, + }, + } + return scope.AddOperation(opspec) +} + +// DestroyResourceOpAttr is an optional argument to DestroyResourceOp. +type DestroyResourceOpAttr func(optionalAttr) + +// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value. +// +// value: whether to ignore the error when the resource +// doesn't exist. +// If not specified, defaults to true +func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr { + return func(m optionalAttr) { + m["ignore_lookup_error"] = value + } +} + +// Deletes the resource specified by the handle. +// +// All subsequent operations using the resource will result in a NotFound +// error status. +// +// Arguments: +// resource: handle to the resource to delete. +// +// Returns the created operation. +func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DestroyResourceOp", + Input: []tf.Input{ + resource, }, Attrs: attrs, } @@ -17721,49 +23787,7133 @@ func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Outp return op.Output(0) } -// VarHandleOpAttr is an optional argument to VarHandleOp. -type VarHandleOpAttr func(optionalAttr) - -// VarHandleOpContainer sets the optional container attribute to value. +// Returns a constant tensor on the host. Only for writing C++ tests. // -// value: the container this variable is placed in. 
+// Arguments: +// value: Attr `value` is the tensor to return. +// +func HostConst(scope *Scope, value tf.Tensor, dtype tf.DataType) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"value": value, "dtype": dtype} + opspec := tf.OpSpec{ + Type: "HostConst", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Concats all tensors in the list along the 0th dimension. +// +// Requires that all tensors have the same shape except the first dimension. +// +// input_handle: The input list. +// element_shape: The shape of the uninitialized elements in the list. If the first +// dimension is not -1, it is assumed that all list elements have the same +// leading dim. +// leading_dims: The list of leading dims of uninitialized list elements. Used if +// the leading dim of input_handle.element_shape or the element_shape input arg +// is not already set. +// tensor: The concated result. +// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient. +// +func TensorListConcatV2(scope *Scope, input_handle tf.Output, element_shape tf.Output, leading_dims tf.Output, element_dtype tf.DataType) (tensor tf.Output, lengths tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"element_dtype": element_dtype} + opspec := tf.OpSpec{ + Type: "TensorListConcatV2", + Input: []tf.Input{ + input_handle, element_shape, leading_dims, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// LoadTPUEmbeddingStochasticGradientDescentParametersAttr is an optional argument to LoadTPUEmbeddingStochasticGradientDescentParameters. +type LoadTPUEmbeddingStochasticGradientDescentParametersAttr func(optionalAttr) + +// LoadTPUEmbeddingStochasticGradientDescentParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func LoadTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) LoadTPUEmbeddingStochasticGradientDescentParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// LoadTPUEmbeddingStochasticGradientDescentParametersTableName sets the optional table_name attribute to value. // If not specified, defaults to "" -func VarHandleOpContainer(value string) VarHandleOpAttr { +func LoadTPUEmbeddingStochasticGradientDescentParametersTableName(value string) LoadTPUEmbeddingStochasticGradientDescentParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Load SGD embedding parameters. +// +// An op that loads optimization parameters into HBM for embedding. Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. +// +// Arguments: +// parameters: Value of parameters used in the stochastic gradient descent optimization algorithm. +// +// +// +// Returns the created operation. 
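+//
+// Hand-written sketch of a call (assumes an existing scope `s`, a parameters
+// output `params`, and a single-shard table; the table name is a placeholder):
+//
+// ```
+// loadOp := LoadTPUEmbeddingStochasticGradientDescentParameters(s, params, 1, 0,
+// 	LoadTPUEmbeddingStochasticGradientDescentParametersTableName("embedding_table"))
+// ```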
+func LoadTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, parameters tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingStochasticGradientDescentParametersAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LoadTPUEmbeddingStochasticGradientDescentParameters", + Input: []tf.Input{ + parameters, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// SparseToDenseAttr is an optional argument to SparseToDense. +type SparseToDenseAttr func(optionalAttr) + +// SparseToDenseValidateIndices sets the optional validate_indices attribute to value. +// +// value: If true, indices are checked to make sure they are sorted in +// lexicographic order and that there are no repeats. +// If not specified, defaults to true +func SparseToDenseValidateIndices(value bool) SparseToDenseAttr { + return func(m optionalAttr) { + m["validate_indices"] = value + } +} + +// Converts a sparse representation into a dense tensor. +// +// Builds an array `dense` with shape `output_shape` such that +// +// ``` +// # If sparse_indices is scalar +// dense[i] = (i == sparse_indices ? sparse_values : default_value) +// +// # If sparse_indices is a vector, then for each i +// dense[sparse_indices[i]] = sparse_values[i] +// +// # If sparse_indices is an n by d matrix, then for each i in [0, n) +// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i] +// ``` +// +// All other values in `dense` are set to `default_value`. If `sparse_values` is a +// scalar, all sparse indices are set to this single value. +// +// Indices should be sorted in lexicographic order, and indices must not +// contain any repeats. If `validate_indices` is true, these properties +// are checked during execution. +// +// Arguments: +// sparse_indices: 0-D, 1-D, or 2-D. `sparse_indices[i]` contains the complete +// index where `sparse_values[i]` will be placed. +// output_shape: 1-D. Shape of the dense output tensor. +// sparse_values: 1-D. Values corresponding to each row of `sparse_indices`, +// or a scalar value to be used for all sparse indices. +// default_value: Scalar value to set for indices not specified in +// `sparse_indices`. +// +// Returns Dense output tensor of shape `output_shape`. +func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SparseToDense", + Input: []tf.Input{ + sparse_indices, output_shape, sparse_values, default_value, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// CastAttr is an optional argument to Cast. +type CastAttr func(optionalAttr) + +// CastTruncate sets the optional Truncate attribute to value. +// If not specified, defaults to false +func CastTruncate(value bool) CastAttr { + return func(m optionalAttr) { + m["Truncate"] = value + } +} + +// Cast x of type SrcT to y of DstT. 
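+//
+// Hand-written sketch (assumes an existing scope `s` and a float output `x`);
+// the optional `CastTruncate(true)` argument is included only to illustrate
+// how attributes are passed:
+//
+// ```
+// y := Cast(s, x, tf.Int32, CastTruncate(true))
+// ```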
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"DstT": DstT} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Cast", + Input: []tf.Input{ + x, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad. +type ResourceSparseApplyProximalAdagradAttr func(optionalAttr) + +// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value. +// +// value: If True, updating of the var and accum tensors will be protected by +// a lock; otherwise the behavior is undefined, but may exhibit less contention. +// If not specified, defaults to false +func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm. +// +// That is for rows we have grad for, we update var and accum as follows: +// accum += grad * grad +// prox_v = var +// prox_v -= lr * grad * (1 / sqrt(accum)) +// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} +// +// Arguments: +// var_: Should be from a Variable(). +// accum: Should be from a Variable(). +// lr: Learning rate. Must be a scalar. +// l1: L1 regularization. Must be a scalar. +// l2: L2 regularization. Must be a scalar. +// grad: The gradient. +// indices: A vector of indices into the first dimension of var and accum. +// +// Returns the created operation. +func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceSparseApplyProximalAdagrad", + Input: []tf.Input{ + var_, accum, lr, l1, l2, grad, indices, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Elementwise computes the bitwise XOR of `x` and `y`. +// +// The result will have those bits set, that are different in `x` and `y`. The +// computation is performed on the underlying representations of `x` and `y`. +func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BitwiseXor", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RandomUniformAttr is an optional argument to RandomUniform. +type RandomUniformAttr func(optionalAttr) + +// RandomUniformSeed sets the optional seed attribute to value. +// +// value: If either `seed` or `seed2` are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func RandomUniformSeed(value int64) RandomUniformAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// RandomUniformSeed2 sets the optional seed2 attribute to value. +// +// value: A second seed to avoid seed collision. 
+// If not specified, defaults to 0 +func RandomUniformSeed2(value int64) RandomUniformAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Outputs random values from a uniform distribution. +// +// The generated values follow a uniform distribution in the range `[0, 1)`. The +// lower bound 0 is included in the range, while the upper bound 1 is excluded. +// +// Arguments: +// shape: The shape of the output tensor. +// dtype: The type of the output. +// +// Returns A tensor of the specified shape filled with uniform random values. +func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RandomUniform", + Input: []tf.Input{ + shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// TensorSummaryAttr is an optional argument to TensorSummary. +type TensorSummaryAttr func(optionalAttr) + +// TensorSummaryDescription sets the optional description attribute to value. +// +// value: A json-encoded SummaryDescription proto. +// If not specified, defaults to "" +func TensorSummaryDescription(value string) TensorSummaryAttr { + return func(m optionalAttr) { + m["description"] = value + } +} + +// TensorSummaryLabels sets the optional labels attribute to value. +// +// value: An unused list of strings. +// If not specified, defaults to <> +func TensorSummaryLabels(value []string) TensorSummaryAttr { + return func(m optionalAttr) { + m["labels"] = value + } +} + +// TensorSummaryDisplayName sets the optional display_name attribute to value. +// +// value: An unused string. +// If not specified, defaults to "" +func TensorSummaryDisplayName(value string) TensorSummaryAttr { + return func(m optionalAttr) { + m["display_name"] = value + } +} + +// Outputs a `Summary` protocol buffer with a tensor. +// +// This op is being phased out in favor of TensorSummaryV2, which lets callers pass +// a tag as well as a serialized SummaryMetadata proto string that contains +// plugin-specific data. We will keep this op to maintain backwards compatibility. +// +// Arguments: +// tensor: A tensor to serialize. +func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TensorSummary", + Input: []tf.Input{ + tensor, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta. +type ResourceApplyAdadeltaAttr func(optionalAttr) + +// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value. +// +// value: If True, updating of the var, accum and update_accum tensors will be protected by +// a lock; otherwise the behavior is undefined, but may exhibit less contention. +// If not specified, defaults to false +func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' according to the adadelta scheme. 
+// +// accum = rho() * accum + (1 - rho()) * grad.square(); +// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; +// update_accum = rho() * update_accum + (1 - rho()) * update.square(); +// var -= update; +// +// Arguments: +// var_: Should be from a Variable(). +// accum: Should be from a Variable(). +// accum_update: Should be from a Variable(). +// lr: Scaling factor. Must be a scalar. +// rho: Decay factor. Must be a scalar. +// epsilon: Constant factor. Must be a scalar. +// grad: The gradient. +// +// Returns the created operation. +func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceApplyAdadelta", + Input: []tf.Input{ + var_, accum, accum_update, lr, rho, epsilon, grad, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Resizes the list. +// +// +// input_handle: the input list +// size: size of the output list +// +func TensorListResize(scope *Scope, input_handle tf.Output, size tf.Output) (output_handle tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorListResize", + Input: []tf.Input{ + input_handle, size, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Gradients for batch normalization. +// +// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization() +// +// This op is deprecated. See `tf.nn.batch_normalization`. +// +// Arguments: +// t: A 4D input Tensor. +// m: A 1D mean Tensor with size matching the last dimension of t. +// This is the first output from tf.nn.moments, +// or a saved moving average thereof. +// v: A 1D variance Tensor with size matching the last dimension of t. +// This is the second output from tf.nn.moments, +// or a saved moving average thereof. +// gamma: A 1D gamma Tensor with size matching the last dimension of t. +// If "scale_after_normalization" is true, this Tensor will be multiplied +// with the normalized Tensor. +// backprop: 4D backprop Tensor. +// variance_epsilon: A small float number to avoid dividing by 0. +// scale_after_normalization: A bool indicating whether the resulted tensor +// needs to be multiplied with gamma. +// +// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma. +func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization} + opspec := tf.OpSpec{ + Type: "BatchNormWithGlobalNormalizationGrad", + Input: []tf.Input{ + t, m, v, gamma, backprop, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4) +} + +// InfeedEnqueueAttr is an optional argument to InfeedEnqueue. +type InfeedEnqueueAttr func(optionalAttr) + +// InfeedEnqueueShape sets the optional shape attribute to value. +// +// value: The shape of the tensor. 
+// If not specified, defaults to <> +func InfeedEnqueueShape(value tf.Shape) InfeedEnqueueAttr { + return func(m optionalAttr) { + m["shape"] = value + } +} + +// InfeedEnqueueLayout sets the optional layout attribute to value. +// +// value: A vector holding the requested layout in minor-to-major sequence. +// If a layout attribute is passed, but its values are all -1, the layout will +// be computed by the infeed operation. +// If not specified, defaults to <> +func InfeedEnqueueLayout(value []int64) InfeedEnqueueAttr { + return func(m optionalAttr) { + m["layout"] = value + } +} + +// InfeedEnqueueDeviceOrdinal sets the optional device_ordinal attribute to value. +// +// value: The TPU device to use. This should be -1 when the Op +// is running on a TPU device, and >= 0 when the Op is running on the CPU +// device. +// If not specified, defaults to -1 +func InfeedEnqueueDeviceOrdinal(value int64) InfeedEnqueueAttr { + return func(m optionalAttr) { + m["device_ordinal"] = value + } +} + +// An op which feeds a single Tensor value into the computation. +// +// Arguments: +// input: A tensor that will be provided using the infeed mechanism. +// +// Returns the created operation. +func InfeedEnqueue(scope *Scope, input tf.Output, optional ...InfeedEnqueueAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "InfeedEnqueue", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Computes the Bessel i1e function of `x` element-wise. +// +// Exponentially scaled modified Bessel function of order 0 defined as +// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`. +// +// This function is faster and numerically stabler than `bessel_i1(x)`. +func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BesselI1e", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MatrixSolveLsAttr is an optional argument to MatrixSolveLs. +type MatrixSolveLsAttr func(optionalAttr) + +// MatrixSolveLsFast sets the optional fast attribute to value. +// If not specified, defaults to true +func MatrixSolveLsFast(value bool) MatrixSolveLsAttr { + return func(m optionalAttr) { + m["fast"] = value + } +} + +// Solves one or more linear least-squares problems. +// +// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions +// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same +// type as `matrix` and shape `[..., M, K]`. +// The output is a tensor shape `[..., N, K]` where each output matrix solves +// each of the equations +// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]` +// in the least squares sense. +// +// We use the following notation for (complex) matrix and right-hand sides +// in the batch: +// +// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\), +// `rhs`=\\(B \in \mathbb{C}^{m \times k}\\), +// `output`=\\(X \in \mathbb{C}^{n \times k}\\), +// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\). +// +// If `fast` is `True`, then the solution is computed by solving the normal +// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then +// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares +// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\). 
+// If \\(m \lt n\\) then `output` is computed as +// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the +// minimum-norm solution to the under-determined linear system, i.e. +// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\), +// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable +// when \\(A\\) is numerically full rank and has a condition number +// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is +// sufficiently large. +// +// If `fast` is `False` an algorithm based on the numerically robust complete +// orthogonal decomposition is used. This computes the minimum-norm +// least-squares solution, even when \\(A\\) is rank deficient. This path is +// typically 6-7 times slower than the fast path. If `fast` is `False` then +// `l2_regularizer` is ignored. +// +// Arguments: +// matrix: Shape is `[..., M, N]`. +// rhs: Shape is `[..., M, K]`. +// l2_regularizer: Scalar tensor. +// +// @compatibility(numpy) +// Equivalent to np.linalg.lstsq +// @end_compatibility +// +// Returns Shape is `[..., N, K]`. +func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MatrixSolveLs", + Input: []tf.Input{ + matrix, rhs, l2_regularizer, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Produces the max pool of the input tensor for quantized types. +// +// Arguments: +// input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over. +// min_input: The float value that the lowest quantized input value represents. +// max_input: The float value that the highest quantized input value represents. +// ksize: The size of the window for each dimension of the input tensor. +// The length must be 4 to match the number of dimensions of the input. +// strides: The stride of the sliding window for each dimension of the input +// tensor. The length must be 4 to match the number of dimensions of the input. +// padding: The type of padding algorithm to use. +// +// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. +func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} + opspec := tf.OpSpec{ + Type: "QuantizedMaxPool", + Input: []tf.Input{ + input, min_input, max_input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA. +type ResourceSparseApplyAdagradDAAttr func(optionalAttr) + +// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value. +// +// value: If True, updating of the var and accum tensors will be protected by +// a lock; otherwise the behavior is undefined, but may exhibit less contention. 
+// If not specified, defaults to false +func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update entries in '*var' and '*accum' according to the proximal adagrad scheme. +// +// Arguments: +// var_: Should be from a Variable(). +// gradient_accumulator: Should be from a Variable(). +// gradient_squared_accumulator: Should be from a Variable(). +// grad: The gradient. +// indices: A vector of indices into the first dimension of var and accum. +// lr: Learning rate. Must be a scalar. +// l1: L1 regularization. Must be a scalar. +// l2: L2 regularization. Must be a scalar. +// global_step: Training step number. Must be a scalar. +// +// Returns the created operation. +func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceSparseApplyAdagradDA", + Input: []tf.Input{ + var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Determine the script codes of a given tensor of Unicode integer code points. +// +// This operation converts Unicode code points to script codes corresponding to +// each code point. Script codes correspond to International Components for +// Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html. +// Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will +// match input shape. +// +// Arguments: +// input: A Tensor of int32 Unicode code points. +// +// Returns A Tensor of int32 script codes corresponding to each input code point. +func UnicodeScript(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "UnicodeScript", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AudioSummaryAttr is an optional argument to AudioSummary. +type AudioSummaryAttr func(optionalAttr) + +// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value. +// +// value: Max number of batch elements to generate audio for. +// If not specified, defaults to 3 +// +// REQUIRES: value >= 1 +func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr { + return func(m optionalAttr) { + m["max_outputs"] = value + } +} + +// Outputs a `Summary` protocol buffer with audio. +// +// DEPRECATED at GraphDef version 15: Use AudioSummaryV2. +// +// The summary has up to `max_outputs` summary values containing audio. The +// audio is built from `tensor` which must be 3-D with shape `[batch_size, +// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are +// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`. +// +// The `tag` argument is a scalar `Tensor` of type `string`. It is used to +// build the `tag` of the summary values: +// +// * If `max_outputs` is 1, the summary value tag is '*tag*/audio'. +// * If `max_outputs` is greater than 1, the summary value tags are +// generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc. 
+// +// Arguments: +// tag: Scalar. Used to build the `tag` attribute of the summary values. +// tensor: 2-D of shape `[batch_size, frames]`. +// sample_rate: The sample rate of the signal in hertz. +// +// Returns Scalar. Serialized `Summary` protocol buffer. +func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"sample_rate": sample_rate} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "AudioSummary", + Input: []tf.Input{ + tag, tensor, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Output a fact about factorials. +func Fact(scope *Scope) (fact tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Fact", + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// VariableShapeAttr is an optional argument to VariableShape. +type VariableShapeAttr func(optionalAttr) + +// VariableShapeOutType sets the optional out_type attribute to value. +// If not specified, defaults to DT_INT32 +func VariableShapeOutType(value tf.DataType) VariableShapeAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// Returns the shape of the variable pointed to by `resource`. +// +// This operation returns a 1-D integer tensor representing the shape of `input`. +// +// For example: +// +// ``` +// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] +// shape(t) ==> [2, 2, 3] +// ``` +func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "VariableShape", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Outputs deterministic pseudorandom random integers from a uniform distribution. +// +// The generated values follow a uniform distribution in the range `[minval, maxval)`. +// +// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`. +// +// Arguments: +// shape: The shape of the output tensor. +// seed: 2 seeds (shape [2]). +// minval: Minimum value (inclusive, scalar). +// maxval: Maximum value (exclusive, scalar). +// +// Returns Random values with specified shape. +func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "StatelessRandomUniformInt", + Input: []tf.Input{ + shape, seed, minval, maxval, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// PrelinearizeTupleAttr is an optional argument to PrelinearizeTuple. +type PrelinearizeTupleAttr func(optionalAttr) + +// PrelinearizeTupleLayouts sets the optional layouts attribute to value. +// +// value: A vector holding the requested layout in minor-to-major sequence for all the +// tuple shapes in the order the shapes appear in the "shapes" input. The layout +// elements for a sub-shape can be set to -1 in which case the corresponding layout +// will be computed by the infeed operation. 
+// If not specified, defaults to <> +func PrelinearizeTupleLayouts(value []int64) PrelinearizeTupleAttr { + return func(m optionalAttr) { + m["layouts"] = value + } +} + +// An op which linearizes multiple Tensor values to an opaque variant tensor. +// +// Arguments: +// inputs: A list of tensors that will be provided using the infeed mechanism. +// shapes: The shapes of each tensor in `inputs`. +func PrelinearizeTuple(scope *Scope, inputs []tf.Output, shapes []tf.Shape, optional ...PrelinearizeTupleAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"shapes": shapes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "PrelinearizeTuple", + Input: []tf.Input{ + tf.OutputList(inputs), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a MultiDeviceIterator resource. +// +// Arguments: +// devices: A list of devices the iterator works across. +// shared_name: If non-empty, this resource will be shared under the given name +// across multiple sessions. +// container: If non-empty, this resource is placed in the given container. +// Otherwise, a default container is used. +// output_types: The type list for the return values. +// output_shapes: The list of shapes being produced. +// +// Returns Handle to the resource created. +func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "MultiDeviceIterator", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes rectified linear 6: `min(max(features, 0), 6)`. +func Relu6(scope *Scope, features tf.Output) (activations tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Relu6", + Input: []tf.Input{ + features, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Conv3DAttr is an optional argument to Conv3D. +type Conv3DAttr func(optionalAttr) + +// Conv3DDataFormat sets the optional data_format attribute to value. +// +// value: The data format of the input and output data. With the +// default format "NDHWC", the data is stored in the order of: +// [batch, in_depth, in_height, in_width, in_channels]. +// Alternatively, the format could be "NCDHW", the data storage order is: +// [batch, in_channels, in_depth, in_height, in_width]. +// If not specified, defaults to "NDHWC" +func Conv3DDataFormat(value string) Conv3DAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Conv3DDilations sets the optional dilations attribute to value. +// +// value: 1-D tensor of length 5. The dilation factor for each dimension of +// `input`. If set to k > 1, there will be k-1 skipped cells between each +// filter element on that dimension. The dimension order is determined by the +// value of `data_format`, see above for details. Dilations in the batch and +// depth dimensions must be 1. +// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 > +func Conv3DDilations(value []int64) Conv3DAttr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes a 3-D convolution given 5-D `input` and `filter` tensors. 
+// +// In signal processing, cross-correlation is a measure of similarity of +// two waveforms as a function of a time-lag applied to one of them. This +// is also known as a sliding dot product or sliding inner-product. +// +// Our Conv3D implements a form of cross-correlation. +// +// Arguments: +// input: Shape `[batch, in_depth, in_height, in_width, in_channels]`. +// filter: Shape `[filter_depth, filter_height, filter_width, in_channels, +// out_channels]`. `in_channels` must match between `input` and `filter`. +// strides: 1-D tensor of length 5. The stride of the sliding window for each +// dimension of `input`. Must have `strides[0] = strides[4] = 1`. +// padding: The type of padding algorithm to use. +func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Conv3D", + Input: []tf.Input{ + input, filter, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// SdcaOptimizerV2Attr is an optional argument to SdcaOptimizerV2. +type SdcaOptimizerV2Attr func(optionalAttr) + +// SdcaOptimizerV2Adaptive sets the optional adaptive attribute to value. +// +// value: Whether to use Adaptive SDCA for the inner loop. +// If not specified, defaults to true +func SdcaOptimizerV2Adaptive(value bool) SdcaOptimizerV2Attr { + return func(m optionalAttr) { + m["adaptive"] = value + } +} + +// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for +// +// linear models with L1 + L2 regularization. As global optimization objective is +// strongly-convex, the optimizer optimizes the dual objective at each step. The +// optimizer applies each update one example at a time. Examples are sampled +// uniformly, and the optimizer is learning rate free and enjoys linear convergence +// rate. +// +// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br> +// Shai Shalev-Shwartz, Tong Zhang. 2012 +// +// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$ +// +// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br> +// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan, +// Peter Richtarik, Martin Takac. 2015 +// +// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br> +// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015 +// +// Arguments: +// sparse_example_indices: a list of vectors which contain example indices. +// sparse_feature_indices: a list of vectors which contain feature indices. +// sparse_feature_values: a list of vectors which contains feature value +// associated with each feature group. +// dense_features: a list of matrices which contains the dense feature values. +// example_weights: a vector which contains the weight associated with each +// example. +// example_labels: a vector which contains the label/target associated with each +// example. +// sparse_indices: a list of vectors where each value is the indices which has +// corresponding weights in sparse_weights. This field maybe omitted for the +// dense approach. +// sparse_weights: a list of vectors where each value is the weight associated with +// a sparse feature group. 
+// dense_weights: a list of vectors where the values are the weights associated +// with a dense feature group. +// example_state_data: a list of vectors containing the example state data. +// loss_type: Type of the primal loss. Currently SdcaSolver supports logistic, +// squared and hinge losses. +// l1: Symmetric l1 regularization strength. +// l2: Symmetric l2 regularization strength. +// num_loss_partitions: Number of partitions of the global loss function. +// num_inner_iterations: Number of iterations per mini-batch. +// +// Returns a list of vectors containing the updated example state +// data.a list of vectors where each value is the delta +// weights associated with a sparse feature group.a list of vectors where the values are the delta +// weights associated with a dense feature group. +func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerV2Attr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SdcaOptimizerV2", + Input: []tf.Input{ + tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + out_example_state_data = op.Output(idx) + if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil { + scope.UpdateErr("SdcaOptimizerV2", err) + return + } + if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil { + scope.UpdateErr("SdcaOptimizerV2", err) + return + } + return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights +} + +// CudnnRNNBackpropV2Attr is an optional argument to CudnnRNNBackpropV2. +type CudnnRNNBackpropV2Attr func(optionalAttr) + +// CudnnRNNBackpropV2RnnMode sets the optional rnn_mode attribute to value. +// If not specified, defaults to "lstm" +func CudnnRNNBackpropV2RnnMode(value string) CudnnRNNBackpropV2Attr { + return func(m optionalAttr) { + m["rnn_mode"] = value + } +} + +// CudnnRNNBackpropV2InputMode sets the optional input_mode attribute to value. +// If not specified, defaults to "linear_input" +func CudnnRNNBackpropV2InputMode(value string) CudnnRNNBackpropV2Attr { + return func(m optionalAttr) { + m["input_mode"] = value + } +} + +// CudnnRNNBackpropV2Direction sets the optional direction attribute to value. 
+// If not specified, defaults to "unidirectional" +func CudnnRNNBackpropV2Direction(value string) CudnnRNNBackpropV2Attr { + return func(m optionalAttr) { + m["direction"] = value + } +} + +// CudnnRNNBackpropV2Dropout sets the optional dropout attribute to value. +// If not specified, defaults to 0 +func CudnnRNNBackpropV2Dropout(value float32) CudnnRNNBackpropV2Attr { + return func(m optionalAttr) { + m["dropout"] = value + } +} + +// CudnnRNNBackpropV2Seed sets the optional seed attribute to value. +// If not specified, defaults to 0 +func CudnnRNNBackpropV2Seed(value int64) CudnnRNNBackpropV2Attr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// CudnnRNNBackpropV2Seed2 sets the optional seed2 attribute to value. +// If not specified, defaults to 0 +func CudnnRNNBackpropV2Seed2(value int64) CudnnRNNBackpropV2Attr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Backprop step of CudnnRNN. +// +// Compute the backprop of both data and weights in a RNN. Takes an extra +// "host_reserved" inupt than CudnnRNNBackprop, which is used to determine RNN +// cudnnRNNAlgo_t and cudnnMathType_t. +// +// rnn_mode: Indicates the type of the RNN model. +// input_mode: Indicates whether there is a linear projection between the input and +// the actual computation before the first layer. 'skip_input' is only allowed +// when input_size == num_units; 'auto_select' implies 'skip_input' when +// input_size == num_units; otherwise, it implies 'linear_input'. +// direction: Indicates whether a bidirectional model will be used. Should be +// "unidirectional" or "bidirectional". +// dropout: Dropout probability. When set to 0., dropout is disabled. +// seed: The 1st part of a seed to initialize dropout. +// seed2: The 2nd part of a seed to initialize dropout. +// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size]. +// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size, +// num_units]. +// input_c: For LSTM, a 3-D tensor with the shape of +// [num_layer * dir, batch, num_units]. For other models, it is ignored. +// params: A 1-D tensor that contains the weights and biases in an opaque layout. +// The size must be created through CudnnRNNParamsSize, and initialized +// separately. Note that they might not be compatible across different +// generations. So it is a good idea to save and restore +// output: A 3-D tensor with the shape of [seq_length, batch_size, +// dir * num_units]. +// output_h: The same shape has input_h. +// output_c: The same shape as input_c for LSTM. An empty tensor for other models. +// output_backprop: A 3-D tensor with the same shape as output in the forward pass. +// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward +// pass. +// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward +// pass. +// reserve_space: The same reserve_space produced in the forward operation. +// host_reserved: The same host_reserved produced in the forward operation. +// input_backprop: The backprop to input in the forward pass. Has the same shape +// as input. +// input_h_backprop: The backprop to input_h in the forward pass. Has the same +// shape as input_h. +// input_c_backprop: The backprop to input_c in the forward pass. Has the same +// shape as input_c. +// params_backprop: The backprop to the params buffer in the forward pass. Has the +// same shape as params. 
+func CudnnRNNBackpropV2(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, host_reserved tf.Output, optional ...CudnnRNNBackpropV2Attr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "CudnnRNNBackpropV2", + Input: []tf.Input{ + input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, host_reserved, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3) +} + +// Inverse 2D fast Fourier transform. +// +// Computes the inverse 2-dimensional discrete Fourier transform over the +// inner-most 2 dimensions of `input`. +// +// Arguments: +// input: A complex tensor. +// +// Returns A complex tensor of the same shape as `input`. The inner-most 2 +// dimensions of `input` are replaced with their inverse 2D Fourier transform. +// +// @compatibility(numpy) +// Equivalent to np.fft.ifft2 +// @end_compatibility +func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "IFFT2D", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// StackV2Attr is an optional argument to StackV2. +type StackV2Attr func(optionalAttr) + +// StackV2StackName sets the optional stack_name attribute to value. +// +// value: Overrides the name used for the temporary stack resource. Default +// value is the name of the 'Stack' op (which is guaranteed unique). +// If not specified, defaults to "" +func StackV2StackName(value string) StackV2Attr { + return func(m optionalAttr) { + m["stack_name"] = value + } +} + +// A stack that produces elements in first-in last-out order. +// +// Arguments: +// max_size: The maximum size of the stack if non-negative. If negative, the stack +// size is unlimited. +// elem_type: The type of the elements on the stack. +// +// Returns The handle to the stack. +func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"elem_type": elem_type} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StackV2", + Input: []tf.Input{ + max_size, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Increments variable pointed to by 'resource' until it reaches 'limit'. +// +// Arguments: +// resource: Should be from a scalar `Variable` node. +// limit: If incrementing ref would bring it above limit, instead generates an +// 'OutOfRange' error. +// +// +// Returns A copy of the input before increment. If nothing else modifies the +// input, the values produced will all be distinct. 
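+//
+// A minimal usage sketch (not part of the generated documentation; it assumes
+// the counter variable has already been created and initialized elsewhere in
+// the graph, e.g. with VarHandleOp and AssignVariableOp):
+//
+//     s := op.NewScope()
+//     counter := op.VarHandleOp(s, tf.Int64, tf.ScalarShape())
+//     // ... initialize the variable with AssignVariableOp ...
+//     next := op.ResourceCountUpTo(s, counter, 100, tf.Int64)
+//     // Each evaluation of `next` returns the value before the increment and
+//     // raises an OutOfRange error once the limit of 100 is reached.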
+func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"limit": limit, "T": T} + opspec := tf.OpSpec{ + Type: "ResourceCountUpTo", + Input: []tf.Input{ + resource, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum. +type ResourceApplyMomentumAttr func(optionalAttr) + +// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var and accum tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value. +// +// value: If `True`, the tensor passed to compute grad will be +// var - lr * momentum * accum, so in the end, the var you get is actually +// var - lr * momentum * accum. +// If not specified, defaults to false +func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr { + return func(m optionalAttr) { + m["use_nesterov"] = value + } +} + +// Update '*var' according to the momentum scheme. Set use_nesterov = True if you +// +// want to use Nesterov momentum. +// +// accum = accum * momentum + grad +// var -= lr * accum +// +// Arguments: +// var_: Should be from a Variable(). +// accum: Should be from a Variable(). +// lr: Scaling factor. Must be a scalar. +// grad: The gradient. +// momentum: Momentum. Must be a scalar. +// +// Returns the created operation. +func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceApplyMomentum", + Input: []tf.Input{ + var_, accum, lr, grad, momentum, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// ResourceSparseApplyKerasMomentumAttr is an optional argument to ResourceSparseApplyKerasMomentum. +type ResourceSparseApplyKerasMomentumAttr func(optionalAttr) + +// ResourceSparseApplyKerasMomentumUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var and accum tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceSparseApplyKerasMomentumUseLocking(value bool) ResourceSparseApplyKerasMomentumAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// ResourceSparseApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value. +// +// value: If `True`, the tensor passed to compute grad will be +// var + momentum * accum, so in the end, the var you get is actually +// var + momentum * accum. +// If not specified, defaults to false +func ResourceSparseApplyKerasMomentumUseNesterov(value bool) ResourceSparseApplyKerasMomentumAttr { + return func(m optionalAttr) { + m["use_nesterov"] = value + } +} + +// Update relevant entries in '*var' and '*accum' according to the momentum scheme. 
+// +// Set use_nesterov = True if you want to use Nesterov momentum. +// +// That is for rows we have grad for, we update var and accum as follows: +// +// accum = accum * momentum - lr * grad +// var += accum +// +// Arguments: +// var_: Should be from a Variable(). +// accum: Should be from a Variable(). +// lr: Learning rate. Must be a scalar. +// grad: The gradient. +// indices: A vector of indices into the first dimension of var and accum. +// momentum: Momentum. Must be a scalar. +// +// Returns the created operation. +func ResourceSparseApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyKerasMomentumAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceSparseApplyKerasMomentum", + Input: []tf.Input{ + var_, accum, lr, grad, indices, momentum, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// EnqueueTPUEmbeddingSparseBatchAttr is an optional argument to EnqueueTPUEmbeddingSparseBatch. +type EnqueueTPUEmbeddingSparseBatchAttr func(optionalAttr) + +// EnqueueTPUEmbeddingSparseBatchDeviceOrdinal sets the optional device_ordinal attribute to value. +// +// value: The TPU device to use. Should be >= 0 and less than the number +// of TPU cores in the task on which the node is placed. +// If not specified, defaults to -1 +func EnqueueTPUEmbeddingSparseBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingSparseBatchAttr { + return func(m optionalAttr) { + m["device_ordinal"] = value + } +} + +// EnqueueTPUEmbeddingSparseBatchCombiners sets the optional combiners attribute to value. +// +// value: A list of string scalars, one for each embedding table that specify +// how to normalize the embedding activations after weighted summation. +// Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have +// the sum of the weights be 0 for 'mean' or the sum of the squared weights be +// 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for +// all tables. +// If not specified, defaults to <> +func EnqueueTPUEmbeddingSparseBatchCombiners(value []string) EnqueueTPUEmbeddingSparseBatchAttr { + return func(m optionalAttr) { + m["combiners"] = value + } +} + +// An op that enqueues TPUEmbedding input indices from a SparseTensor. +// +// This Op eases the porting of code that uses embedding_lookup_sparse(), +// although some Python preprocessing of the SparseTensor arguments to +// embedding_lookup_sparse() is required to produce the arguments to this Op, +// since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training +// step. +// +// The tensors at corresponding positions in the three input lists +// must have the same shape, i.e. rank 1 with dim_size() equal to the total +// number of lookups into the table described by the corresponding table_id. +// +// Arguments: +// sample_indices: A list of rank 1 Tensors specifying the training example and +// feature to which the corresponding embedding_indices and aggregation_weights +// values belong. sample_indices[i] must equal b * nf + f, where nf is the +// number of features from the corresponding table, f is in [0, nf), and +// b is in [0, batch size). +// embedding_indices: A list of rank 1 Tensors, indices into the embedding tables. +// aggregation_weights: A list of rank 1 Tensors containing per sample -- i.e. 
per
+// (training example, feature) -- aggregation weights.
+// mode_override: A string input that overrides the mode specified in the
+// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+//
+// Returns the created operation.
+func EnqueueTPUEmbeddingSparseBatch(scope *Scope, sample_indices []tf.Output, embedding_indices []tf.Output, aggregation_weights []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingSparseBatchAttr) (o *tf.Operation) {
+ if scope.Err() != nil {
+ return
+ }
+ attrs := map[string]interface{}{}
+ for _, a := range optional {
+ a(attrs)
+ }
+ opspec := tf.OpSpec{
+ Type: "EnqueueTPUEmbeddingSparseBatch",
+ Input: []tf.Input{
+ tf.OutputList(sample_indices), tf.OutputList(embedding_indices), tf.OutputList(aggregation_weights), mode_override,
+ },
+ Attrs: attrs,
+ }
+ return scope.AddOperation(opspec)
+}
+
+// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
+
+// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
+//
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index 0 1 2 3 4`
+//
+// `value 20 5 16 3 7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+ return func(m optionalAttr) {
+ m["overlapping"] = value
+ }
+}
+
+// Computes gradient of the FractionalMaxPool function.
+//
+// Arguments:
+// orig_input: Original input for `fractional_max_pool`
+// orig_output: Original output for `fractional_max_pool`
+// out_backprop: 4-D with shape `[batch, height, width, channels]`. Gradients
+// w.r.t. the output of `fractional_max_pool`.
+// row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+// col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling_sequence.
+//
+// Returns 4-D. Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ attrs := map[string]interface{}{}
+ for _, a := range optional {
+ a(attrs)
+ }
+ opspec := tf.OpSpec{
+ Type: "FractionalMaxPoolGrad",
+ Input: []tf.Input{
+ orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+ },
+ Attrs: attrs,
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0)
+}
+
+// Computes the sum along sparse segments of a tensor.
+//
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// missing, the `output` tensor at that position will be zeroed.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/sparse#Segmentation)
+// for an explanation of segments.
+//
+// For example:
+//
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
+//
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2]),
+//                                         num_segments=4)
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
+// ```
+//
+// Arguments:
+//
+// indices: A 1-D tensor. Has same rank as `segment_ids`.
+// segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ opspec := tf.OpSpec{
+ Type: "SparseSegmentSumWithNumSegments",
+ Input: []tf.Input{
+ data, indices, segment_ids, num_segments,
+ },
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0)
+}
+
+// Quantized Batch normalization.
+//
+// This op is deprecated and will be removed in the future. Prefer
+// `tf.nn.batch_normalization`.
+//
+// Arguments:
+// t: A 4D input Tensor.
+// t_min: The value represented by the lowest quantized input.
+// t_max: The value represented by the highest quantized input.
+// m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+// m_min: The value represented by the lowest quantized mean.
+// m_max: The value represented by the highest quantized mean.
+// v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+// v_min: The value represented by the lowest quantized variance.
+// v_max: The value represented by the highest quantized variance.
+// beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+// beta_min: The value represented by the lowest quantized offset.
+// beta_max: The value represented by the highest quantized offset.
+// gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+// gamma_min: The value represented by the lowest quantized gamma.
+// gamma_max: The value represented by the highest quantized gamma.
+//
+// variance_epsilon: A small float number to avoid dividing by 0.
+// scale_after_normalization: A bool indicating whether the resulting tensor
+// needs to be multiplied with gamma.
+func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+ opspec := tf.OpSpec{
+ Type: "QuantizedBatchNormWithGlobalNormalization",
+ Input: []tf.Input{
+ t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
+ },
+ Attrs: attrs,
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns a list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
+//
+// tensor: The tensor to put on the list.
+// input_handle: The old list.
+// output_handle: A list with the elements of the old list followed by tensor.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ opspec := tf.OpSpec{
+ Type: "TensorListPushBack",
+ Input: []tf.Input{
+ input_handle, tensor,
+ },
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0)
+}
+
+// Saves input tensor slices to disk.
+//
+// This is like `Save` except that tensors can be listed in the saved file as being
+// a slice of a larger tensor. `shapes_and_slices` specifies the shape of the
+// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+// have as many elements as `tensor_names`.
+//
+// Elements of the `shapes_and_slices` input must either be:
+//
+// * The empty string, in which case the corresponding tensor is
+// saved normally.
+// * A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+// `dimI` are the dimensions of the larger tensor and `slice-spec`
+// specifies what part is covered by the tensor to save.
+//
+// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+// where each `sliceI` is either:
+//
+// * The string `-` meaning that the slice covers all indices of this dimension
+// * `start,length` where `start` and `length` are integers. In that
+// case the slice covers `length` indices starting at `start`.
+//
+// See also `Save`.
+//
+// Arguments:
+// filename: Must have a single element. The name of the file to which we write the
+// tensor.
+// tensor_names: Shape `[N]`. The names of the tensors to be saved.
+// shapes_and_slices: Shape `[N]`. The shapes and slice specifications to use when
+// saving the tensors.
+// data: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+ if scope.Err() != nil {
+ return
+ }
+ opspec := tf.OpSpec{
+ Type: "SaveSlices",
+ Input: []tf.Input{
+ filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+ },
+ }
+ return scope.AddOperation(opspec)
+}
+
+// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
+type ExtractGlimpseAttr func(optionalAttr)
+
+// ExtractGlimpseCentered sets the optional centered attribute to value.
+//
+// value: indicates if the offset coordinates are centered relative to
+// the image, in which case the (0, 0) offset is relative to the center
+// of the input images. If false, the (0,0) offset corresponds to the
+// upper left corner of the input images.
+// If not specified, defaults to true
+func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
+ return func(m optionalAttr) {
+ m["centered"] = value
+ }
+}
+
+// ExtractGlimpseNormalized sets the optional normalized attribute to value.
+//
+// value: indicates if the offset coordinates are normalized.
+// If not specified, defaults to true
+func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
+ return func(m optionalAttr) {
+ m["normalized"] = value
+ }
+}
+
+// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
+//
+// value: indicates if the noise should be generated using a
+// uniform distribution or a Gaussian distribution.
+// If not specified, defaults to true
+func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
+ return func(m optionalAttr) {
+ m["uniform_noise"] = value
+ }
+}
+
+// ExtractGlimpseNoise sets the optional noise attribute to value.
+//
+// value: indicates if the noise should be `uniform`, `gaussian`, or
+// `zero`. The default is `uniform`, which means the noise type
+// will be decided by `uniform_noise`.
+// If not specified, defaults to "uniform"
+func ExtractGlimpseNoise(value string) ExtractGlimpseAttr {
+ return func(m optionalAttr) {
+ m["noise"] = value
+ }
+}
+
+// Extracts a glimpse from the input tensor.
+//
+// Returns a set of windows called glimpses extracted at location
+// `offsets` from the input tensor. If the windows only partially
+// overlap the inputs, the non-overlapping areas will be filled with
+// random noise.
+//
+// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+// glimpse_width, channels]`. The channels and batch dimensions are the
+// same as that of the input tensor. The height and width of the output
+// windows are specified in the `size` parameter.
+//
+// The arguments `normalized` and `centered` control how the windows are built:
+//
+// * If the coordinates are normalized but not centered, 0.0 and 1.0
+// correspond to the minimum and maximum of each height and width
+// dimension.
+// * If the coordinates are both normalized and centered, they range from
+// -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+// left corner, the lower right corner is located at (1.0, 1.0) and the
+// center is at (0, 0).
+// * If the coordinates are not normalized they are interpreted as
+// numbers of pixels.
+//
+// Arguments:
+// input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
+// size: A 1-D tensor of 2 elements containing the size of the glimpses
+// to extract. The glimpse height must be specified first, followed
+// by the glimpse width.
+// offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
+// the y, x locations of the center of each window.
+//
+// Returns A tensor representing the glimpses `[batch_size,
+// glimpse_height, glimpse_width, channels]`.
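+//
+// A minimal usage sketch (illustrative only; the input values and shapes below
+// are assumptions, not part of the generated documentation):
+//
+//     s := op.NewScope()
+//     images := op.Placeholder(s, tf.Float, op.PlaceholderShape(tf.MakeShape(1, 64, 64, 3)))
+//     size := op.Const(s, []int32{8, 8})
+//     offsets := op.Const(s, [][]float32{{0.0, 0.0}})
+//     // With the default centered/normalized attributes, (0.0, 0.0) is the image center.
+//     glimpse := op.ExtractGlimpse(s, images, size, offsets)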
+func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ExtractGlimpse", + Input: []tf.Input{ + input, size, offsets, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse. +type SparseReduceSumSparseAttr func(optionalAttr) + +// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value. +// +// value: If true, retain reduced dimensions with length 1. +// If not specified, defaults to false +func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr { + return func(m optionalAttr) { + m["keep_dims"] = value + } +} + +// Computes the sum of elements across dimensions of a SparseTensor. +// +// This Op takes a SparseTensor and is the sparse counterpart to +// `tf.reduce_sum()`. In contrast to SparseReduceSum, this Op returns a +// SparseTensor. +// +// Reduces `sp_input` along the dimensions given in `reduction_axes`. Unless +// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained +// with length 1. +// +// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor +// with a single element is returned. Additionally, the axes can be negative, +// which are interpreted according to the indexing rules in Python. +// +// Arguments: +// input_indices: 2-D. `N x R` matrix with the indices of non-empty values in a +// SparseTensor, possibly not in canonical ordering. +// input_values: 1-D. `N` non-empty values corresponding to `input_indices`. +// input_shape: 1-D. Shape of the input SparseTensor. +// reduction_axes: 1-D. Length-`K` vector containing the reduction axes. +func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SparseReduceSumSparse", + Input: []tf.Input{ + input_indices, input_values, input_shape, reduction_axes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// RetrieveTPUEmbeddingMDLAdagradLightParametersAttr is an optional argument to RetrieveTPUEmbeddingMDLAdagradLightParameters. +type RetrieveTPUEmbeddingMDLAdagradLightParametersAttr func(optionalAttr) + +// RetrieveTPUEmbeddingMDLAdagradLightParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingMDLAdagradLightParametersTableId(value int64) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// RetrieveTPUEmbeddingMDLAdagradLightParametersTableName sets the optional table_name attribute to value. 
+// If not specified, defaults to "" +func RetrieveTPUEmbeddingMDLAdagradLightParametersTableName(value string) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Retrieve MDL Adagrad Light embedding parameters. +// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the MDL Adagrad Light optimization algorithm.Parameter accumulators updated by the MDL Adagrad Light optimization algorithm.Parameter weights updated by the MDL Adagrad Light optimization algorithm.Parameter benefits updated by the MDL Adagrad Light optimization algorithm. +func RetrieveTPUEmbeddingMDLAdagradLightParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMDLAdagradLightParametersAttr) (parameters tf.Output, accumulators tf.Output, weights tf.Output, benefits tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingMDLAdagradLightParameters", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3) +} + +// BatchAttr is an optional argument to Batch. +type BatchAttr func(optionalAttr) + +// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value. +// If not specified, defaults to 10 +func BatchMaxEnqueuedBatches(value int64) BatchAttr { + return func(m optionalAttr) { + m["max_enqueued_batches"] = value + } +} + +// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value. +// If not specified, defaults to <> +func BatchAllowedBatchSizes(value []int64) BatchAttr { + return func(m optionalAttr) { + m["allowed_batch_sizes"] = value + } +} + +// BatchContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func BatchContainer(value string) BatchAttr { return func(m optionalAttr) { m["container"] = value } } -// VarHandleOpSharedName sets the optional shared_name attribute to value. -// -// value: the name by which this variable is referred to. +// BatchSharedName sets the optional shared_name attribute to value. // If not specified, defaults to "" -func VarHandleOpSharedName(value string) VarHandleOpAttr { +func BatchSharedName(value string) BatchAttr { return func(m optionalAttr) { m["shared_name"] = value } } -// Creates a handle to a Variable resource. +// BatchBatchingQueue sets the optional batching_queue attribute to value. +// If not specified, defaults to "" +func BatchBatchingQueue(value string) BatchAttr { + return func(m optionalAttr) { + m["batching_queue"] = value + } +} + +// Batches all input tensors nondeterministically. // -// Arguments: -// dtype: the type of this variable. Must agree with the dtypes -// of all ops using this variable. -// shape: The (possibly partially specified) shape of this variable. 
-func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
+// When many instances of this Op are being run concurrently with the same
+// container/shared_name in the same device, some will output zero-shaped Tensors
+// and others will output Tensors of size up to max_batch_size.
+//
+// All Tensors in in_tensors are batched together (so, for example, labels and
+// features should be batched with a single instance of this operation).
+//
+// Each invocation of batch emits an `id` scalar which will be used to identify
+// this particular invocation when doing unbatch or its gradient.
+//
+// Each op which emits a non-empty batch will also emit a non-empty batch_index
+// Tensor, which is a [K, 3] matrix where each row contains the invocation's id,
+// start, and length of elements of each set of Tensors present in batched_tensors.
+//
+// Batched tensors are concatenated along the first dimension, and all tensors in
+// in_tensors must have the first dimension of the same size.
+//
+// in_tensors: The tensors to be batched.
+// num_batch_threads: Number of scheduling threads for processing batches of work.
+// Determines the number of batches processed in parallel.
+// max_batch_size: Batch sizes will never be bigger than this.
+// batch_timeout_micros: Maximum number of microseconds to wait before outputting
+// an incomplete batch.
+// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
+// nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+// batches up to one of those sizes. The entries must increase monotonically, and
+// the final entry must equal max_batch_size.
+// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
+// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
+// batch_index: If out_tensors is non-empty, has information to invert it.
+// container: Controls the scope of sharing of this batch.
+// id: always contains a scalar with a unique ID for this invocation of Batch.
+// shared_name: Concurrently running instances of batch in the same device with the
+// same container and shared_name will batch their elements together. If left
+// empty, the op name will be used as the shared name.
+// T: the types of tensors to be batched.
+func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
 if scope.Err() != nil {
 return
 }
- attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+ attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
 for _, a := range optional {
 a(attrs)
 }
 opspec := tf.OpSpec{
- Type: "VarHandleOp",
+ Type: "Batch",
+ Input: []tf.Input{
+ tf.OutputList(in_tensors),
+ },
+ Attrs: attrs,
+ }
+ op := scope.AddOperation(opspec)
+ if scope.Err() != nil {
+ return
+ }
+ var idx int
+ var err error
+ if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
+ scope.UpdateErr("Batch", err)
+ return
+ }
+ batch_index = op.Output(idx)
+ id = op.Output(idx)
+ return batched_tensors, batch_index, id
+}
+
+// CompilationResultProto indicating the status of the TPU compilation.
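+//
+// A trivial usage sketch (illustrative only):
+//
+//     s := op.NewScope()
+//     result := op.TPUCompilationResult(s)
+//     // `result` is a string tensor holding the serialized CompilationResultProto.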
+func TPUCompilationResult(scope *Scope) (output tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ opspec := tf.OpSpec{
+ Type: "TPUCompilationResult",
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0)
+}
+
+// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
+
+// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
+ return func(m optionalAttr) {
+ m["dtype"] = value
+ }
+}
+
+// Outputs deterministic pseudorandom values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
+//
+// Arguments:
+// shape: The shape of the output tensor.
+// seed: 2 seeds (shape [2]).
+//
+// Returns Random values with specified shape.
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ attrs := map[string]interface{}{}
+ for _, a := range optional {
+ a(attrs)
+ }
+ opspec := tf.OpSpec{
+ Type: "StatelessTruncatedNormal",
+ Input: []tf.Input{
+ shape, seed,
+ },
+ Attrs: attrs,
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0)
+}
+
+// 3D real-valued fast Fourier transform.
+//
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+// input: A float32 tensor.
+// fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+// dimensions of `input` are replaced with their 3D Fourier transform. The
+// inner-most dimension contains `fft_length / 2 + 1` unique frequency
+// components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfftn with 3 dimensions.
+// @end_compatibility
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ opspec := tf.OpSpec{
+ Type: "RFFT3D",
+ Input: []tf.Input{
+ input, fft_length,
+ },
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0)
+}
+
+// Extract `patches` from `images` and put them in the "depth" output dimension.
+//
+// Arguments:
+// images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
+// ksizes: The size of the sliding window for each dimension of `images`.
+// strides: 1-D of length 4. How far the centers of two consecutive patches are in
+// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
+// rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`.
This is the +// input stride, specifying how far two consecutive patch samples are in the +// input. Equivalent to extracting patches with +// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by +// subsampling them spatially by a factor of `rates`. This is equivalent to +// `rate` in dilated (a.k.a. Atrous) convolutions. +// padding: The type of padding algorithm to use. +// +// We specify the size-related attributes as: +// +// ```python +// ksizes = [1, ksize_rows, ksize_cols, 1] +// strides = [1, strides_rows, strides_cols, 1] +// rates = [1, rates_rows, rates_cols, 1] +// ``` +// +// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows * +// ksize_cols * depth]` containing image patches with size +// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note +// `out_rows` and `out_cols` are the dimensions of the output patches. +func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding} + opspec := tf.OpSpec{ + Type: "ExtractImagePatches", + Input: []tf.Input{ + images, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// StringToNumberAttr is an optional argument to StringToNumber. +type StringToNumberAttr func(optionalAttr) + +// StringToNumberOutType sets the optional out_type attribute to value. +// +// value: The numeric type to interpret each string in `string_tensor` as. +// If not specified, defaults to DT_FLOAT +func StringToNumberOutType(value tf.DataType) StringToNumberAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// Converts each string in the input Tensor to the specified numeric type. +// +// (Note that int32 overflow results in an error while float overflow +// results in a rounded value.) +// +// Returns A Tensor of the same shape as the input `string_tensor`. +func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StringToNumber", + Input: []tf.Input{ + string_tensor, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad. +type ResizeNearestNeighborGradAttr func(optionalAttr) + +// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value. +// +// value: If true, the centers of the 4 corner pixels of the input and grad tensors are +// aligned. Defaults to false. +// If not specified, defaults to false +func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr { + return func(m optionalAttr) { + m["align_corners"] = value + } +} + +// ResizeNearestNeighborGradHalfPixelCenters sets the optional half_pixel_centers attribute to value. +// If not specified, defaults to false +func ResizeNearestNeighborGradHalfPixelCenters(value bool) ResizeNearestNeighborGradAttr { + return func(m optionalAttr) { + m["half_pixel_centers"] = value + } +} + +// Computes the gradient of nearest neighbor interpolation. +// +// Arguments: +// grads: 4-D with shape `[batch, height, width, channels]`. 
+// size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The +// original input size. +// +// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients +// with respect to the input image. +func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResizeNearestNeighborGrad", + Input: []tf.Input{ + grads, size, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Concatenates tensors along one dimension. +// +// Arguments: +// concat_dim: 0-D. The dimension along which to concatenate. Must be in the +// range [0, rank(values)). +// values: The `N` Tensors to concatenate. Their ranks and types must match, +// and their sizes must match in all dimensions except `concat_dim`. +// +// Returns A `Tensor` with the concatenation of values stacked along the +// `concat_dim` dimension. This tensor's shape matches that of `values` except +// in `concat_dim` where it has the sum of the sizes. +func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Concat", + Input: []tf.Input{ + concat_dim, tf.OutputList(values), + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingADAMParametersGradAccumDebug. +type RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr func(optionalAttr) + +// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Retrieve ADAM embedding parameters with debug support. +// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the ADAM optimization algorithm.Parameter momenta updated by the ADAM optimization algorithm.Parameter velocities updated by the ADAM optimization algorithm.Parameter gradient_accumulators updated by the ADAM optimization algorithm. 
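+//
+// A minimal usage sketch (illustrative only; it assumes a single shard and
+// that the TPU embedding system has already been configured):
+//
+//     s := op.NewScope()
+//     params, momenta, velocities, gradAccum :=
+//         op.RetrieveTPUEmbeddingADAMParametersGradAccumDebug(s, 1, 0)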
+func RetrieveTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug", Attrs: attrs, } op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3) +} + +// Reduces sparse updates into the variable referenced by `resource` using the `max` operation. +// +// This operation computes +// +// # Scalar indices +// ref[indices, ...] = max(ref[indices, ...], updates[...]) +// +// # Vector indices (for each i) +// ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...]) +// +// # High rank indices (for each i, ..., j) +// ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...]) +// +// Duplicate entries are handled correctly: if multiple `indices` reference +// the same location, their contributions are combined. +// +// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. +// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt> +// </div> +// +// Arguments: +// resource: Should be from a `Variable` node. +// indices: A tensor of indices into the first dimension of `ref`. +// updates: A tensor of updated values to add to `ref`. +// +// Returns the created operation. +func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ResourceScatterMax", + Input: []tf.Input{ + resource, indices, updates, + }, + } + return scope.AddOperation(opspec) +} + +// Computes the Gauss error function of `x` element-wise. +func Erf(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Erf", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Selects the k nearest centers for each point. +// +// Rows of points are assumed to be input points. Rows of centers are assumed to be +// the list of candidate centers. For each point, the k centers that have least L2 +// distance to it are computed. +// +// Arguments: +// points: Matrix of shape (n, d). Rows are assumed to be input points. +// centers: Matrix of shape (m, d). Rows are assumed to be centers. +// k: Number of nearest centers to return for each point. If k is larger than m, then +// only m centers are returned. +// +// Returns Matrix of shape (n, min(m, k)). Each row contains the indices of the centers +// closest to the corresponding point, ordered by increasing distance.Matrix of shape (n, min(m, k)). Each row contains the squared L2 distance to the +// corresponding center in nearest_center_indices. 
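+//
+// A minimal usage sketch (illustrative only; the points, centers and k below
+// are assumptions, not part of the generated documentation):
+//
+//     s := op.NewScope()
+//     points := op.Const(s, [][]float32{{0, 0}, {5, 5}})
+//     centers := op.Const(s, [][]float32{{1, 1}, {4, 4}, {10, 10}})
+//     k := op.Const(s, int64(2))
+//     indices, distances := op.NearestNeighbors(s, points, centers, k)
+//     // For the first point, `indices` would list the centers {1,1} and {4,4},
+//     // with their squared L2 distances (2 and 32) in `distances`.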
+func NearestNeighbors(scope *Scope, points tf.Output, centers tf.Output, k tf.Output) (nearest_center_indices tf.Output, nearest_center_distances tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "NearestNeighbors", + Input: []tf.Input{ + points, centers, k, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3. +type TensorArrayGatherV3Attr func(optionalAttr) + +// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value. +// +// value: The expected shape of an element, if known. Used to +// validate the shapes of TensorArray elements. If this shape is not +// fully specified, gathering zero-size TensorArrays is an error. +// If not specified, defaults to <unknown_rank:true > +func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr { + return func(m optionalAttr) { + m["element_shape"] = value + } +} + +// Gather specific elements from the TensorArray into output `value`. +// +// All elements selected by `indices` must have the same shape. +// +// Arguments: +// handle: The handle to a TensorArray. +// indices: The locations in the TensorArray from which to read tensor elements. +// flow_in: A float scalar that enforces proper chaining of operations. +// dtype: The type of the elem that is returned. +// +// Returns All of the elements in the TensorArray, concatenated along a new +// axis (the new dimension 0). +func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TensorArrayGatherV3", + Input: []tf.Input{ + handle, indices, flow_in, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// SdcaOptimizerAttr is an optional argument to SdcaOptimizer. +type SdcaOptimizerAttr func(optionalAttr) + +// SdcaOptimizerAdaptative sets the optional adaptative attribute to value. +// +// value: Whether to use Adaptive SDCA for the inner loop. +// If not specified, defaults to true +func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr { + return func(m optionalAttr) { + m["adaptative"] = value + } +} + +// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for +// +// linear models with L1 + L2 regularization. As global optimization objective is +// strongly-convex, the optimizer optimizes the dual objective at each step. The +// optimizer applies each update one example at a time. Examples are sampled +// uniformly, and the optimizer is learning rate free and enjoys linear convergence +// rate. +// +// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br> +// Shai Shalev-Shwartz, Tong Zhang. 2012 +// +// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$ +// +// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br> +// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan, +// Peter Richtarik, Martin Takac. 2015 +// +// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br> +// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015 +// +// Arguments: +// sparse_example_indices: a list of vectors which contain example indices. 
+// sparse_feature_indices: a list of vectors which contain feature indices. +// sparse_feature_values: a list of vectors which contains feature value +// associated with each feature group. +// dense_features: a list of matrices which contains the dense feature values. +// example_weights: a vector which contains the weight associated with each +// example. +// example_labels: a vector which contains the label/target associated with each +// example. +// sparse_indices: a list of vectors where each value is the indices which has +// corresponding weights in sparse_weights. This field maybe omitted for the +// dense approach. +// sparse_weights: a list of vectors where each value is the weight associated with +// a sparse feature group. +// dense_weights: a list of vectors where the values are the weights associated +// with a dense feature group. +// example_state_data: a list of vectors containing the example state data. +// loss_type: Type of the primal loss. Currently SdcaSolver supports logistic, +// squared and hinge losses. +// l1: Symmetric l1 regularization strength. +// l2: Symmetric l2 regularization strength. +// num_loss_partitions: Number of partitions of the global loss function. +// num_inner_iterations: Number of iterations per mini-batch. +// +// Returns a list of vectors containing the updated example state +// data.a list of vectors where each value is the delta +// weights associated with a sparse feature group.a list of vectors where the values are the delta +// weights associated with a dense feature group. +func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SdcaOptimizer", + Input: []tf.Input{ + tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + out_example_state_data = op.Output(idx) + if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil { + scope.UpdateErr("SdcaOptimizer", err) + return + } + if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil { + scope.UpdateErr("SdcaOptimizer", err) + return + } + return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights +} + +// The gradient of SparseFillEmptyRows. +// +// Takes vectors reverse_index_map, shaped `[N]`, and grad_values, +// shaped `[N_full]`, where `N_full >= N` and copies data into either +// `d_values` or `d_default_value`. 
Here `d_values` is shaped `[N]` and +// `d_default_value` is a scalar. +// +// d_values[j] = grad_values[reverse_index_map[j]] +// d_default_value = sum_{k : 0 .. N_full - 1} ( +// grad_values[k] * 1{k not in reverse_index_map}) +// +// Arguments: +// reverse_index_map: 1-D. The reverse index map from SparseFillEmptyRows. +// grad_values: 1-D. The gradients from backprop. +// +// Returns 1-D. The backprop into values.0-D. The backprop into default_value. +func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseFillEmptyRowsGrad", + Input: []tf.Input{ + reverse_index_map, grad_values, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax. +type ResourceApplyAdaMaxAttr func(optionalAttr) + +// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var, m, and v tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' according to the AdaMax algorithm. +// +// m_t <- beta1 * m_{t-1} + (1 - beta1) * g +// v_t <- max(beta2 * v_{t-1}, abs(g)) +// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon) +// +// Arguments: +// var_: Should be from a Variable(). +// m: Should be from a Variable(). +// v: Should be from a Variable(). +// beta1_power: Must be a scalar. +// lr: Scaling factor. Must be a scalar. +// beta1: Momentum factor. Must be a scalar. +// beta2: Momentum factor. Must be a scalar. +// epsilon: Ridge term. Must be a scalar. +// grad: The gradient. +// +// Returns the created operation. +func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceApplyAdaMax", + Input: []tf.Input{ + var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum. +type ResourceSparseApplyMomentumAttr func(optionalAttr) + +// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var and accum tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value. +// +// value: If `True`, the tensor passed to compute grad will be +// var - lr * momentum * accum, so in the end, the var you get is actually +// var - lr * momentum * accum. 
+// If not specified, defaults to false +func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr { + return func(m optionalAttr) { + m["use_nesterov"] = value + } +} + +// Update relevant entries in '*var' and '*accum' according to the momentum scheme. +// +// Set use_nesterov = True if you want to use Nesterov momentum. +// +// That is for rows we have grad for, we update var and accum as follows: +// +// accum = accum * momentum + grad +// var -= lr * accum +// +// Arguments: +// var_: Should be from a Variable(). +// accum: Should be from a Variable(). +// lr: Learning rate. Must be a scalar. +// grad: The gradient. +// indices: A vector of indices into the first dimension of var and accum. +// momentum: Momentum. Must be a scalar. +// +// Returns the created operation. +func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceSparseApplyMomentum", + Input: []tf.Input{ + var_, accum, lr, grad, indices, momentum, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// RetrieveTPUEmbeddingMomentumParametersAttr is an optional argument to RetrieveTPUEmbeddingMomentumParameters. +type RetrieveTPUEmbeddingMomentumParametersAttr func(optionalAttr) + +// RetrieveTPUEmbeddingMomentumParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingMomentumParametersTableId(value int64) RetrieveTPUEmbeddingMomentumParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// RetrieveTPUEmbeddingMomentumParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func RetrieveTPUEmbeddingMomentumParametersTableName(value string) RetrieveTPUEmbeddingMomentumParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Retrieve Momentum embedding parameters. +// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the Momentum optimization algorithm.Parameter momenta updated by the Momentum optimization algorithm. +func RetrieveTPUEmbeddingMomentumParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersAttr) (parameters tf.Output, momenta tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingMomentumParameters", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// Creates a TensorArray for storing multiple gradients of values in the given handle. +// +// Similar to TensorArrayGradV3. However it creates an accumulator with an +// expanded shape compared to the input TensorArray whose gradient is being +// computed. 
This enables multiple gradients for the same TensorArray to be +// calculated using the same accumulator. +// +// Arguments: +// handle: The handle to the forward TensorArray. +// flow_in: A float scalar that enforces proper chaining of operations. +// shape_to_prepend: An int32 vector representing a shape. Elements in the gradient accumulator will +// have shape which is this shape_to_prepend value concatenated with shape of the +// elements in the TensorArray corresponding to the input handle. +// source: The gradient source string, used to decide which gradient TensorArray +// to return. +func TensorArrayGradWithShape(scope *Scope, handle tf.Output, flow_in tf.Output, shape_to_prepend tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"source": source} + opspec := tf.OpSpec{ + Type: "TensorArrayGradWithShape", + Input: []tf.Input{ + handle, flow_in, shape_to_prepend, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// RetrieveTPUEmbeddingRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParameters. +type RetrieveTPUEmbeddingRMSPropParametersAttr func(optionalAttr) + +// RetrieveTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// RetrieveTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func RetrieveTPUEmbeddingRMSPropParametersTableName(value string) RetrieveTPUEmbeddingRMSPropParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Retrieve RMSProp embedding parameters. +// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the RMSProp optimization algorithm.Parameter ms updated by the RMSProp optimization algorithm.Parameter mom updated by the RMSProp optimization algorithm. +func RetrieveTPUEmbeddingRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingRMSPropParameters", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// DecodeRawAttr is an optional argument to DecodeRaw. +type DecodeRawAttr func(optionalAttr) + +// DecodeRawLittleEndian sets the optional little_endian attribute to value. +// +// value: Whether the input `bytes` are in little-endian order. +// Ignored for `out_type` values that are stored in a single byte like +// `uint8`. +// If not specified, defaults to true +func DecodeRawLittleEndian(value bool) DecodeRawAttr { + return func(m optionalAttr) { + m["little_endian"] = value + } +} + +// Reinterpret the bytes of a string as a vector of numbers. 
+// +// Arguments: +// bytes: All the elements must have the same length. +// +// +// Returns A Tensor with one more dimension than the input `bytes`. The +// added dimension will have size equal to the length of the elements +// of `bytes` divided by the number of bytes to represent `out_type`. +func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"out_type": out_type} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DecodeRaw", + Input: []tf.Input{ + bytes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// TensorListStackAttr is an optional argument to TensorListStack. +type TensorListStackAttr func(optionalAttr) + +// TensorListStackNumElements sets the optional num_elements attribute to value. +// If not specified, defaults to -1 +func TensorListStackNumElements(value int64) TensorListStackAttr { + return func(m optionalAttr) { + m["num_elements"] = value + } +} + +// Stacks all tensors in the list. +// +// Requires that all tensors have the same shape. +// +// input_handle: the input list +// tensor: the gathered result +// num_elements: optional. If not -1, the number of elements in the list. +// +func TensorListStack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"element_dtype": element_dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TensorListStack", + Input: []tf.Input{ + input_handle, element_shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RetrieveTPUEmbeddingProximalAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingProximalAdagradParameters. +type RetrieveTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr) + +// RetrieveTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingProximalAdagradParametersTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// RetrieveTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func RetrieveTPUEmbeddingProximalAdagradParametersTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Retrieve proximal Adagrad embedding parameters. +// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the proximal Adagrad optimization algorithm.Parameter accumulators updated by the proximal Adagrad optimization algorithm. 
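+//
+// Illustrative call sketch (editorial addition, not generated op
+// documentation). It only adds the retrieval node to a graph; running it
+// still requires the TPU embedding configuration described above. The shard
+// values and table name are placeholders:
+//
+//	s := op.NewScope()
+//	params, accums := op.RetrieveTPUEmbeddingProximalAdagradParameters(s, 1, 0,
+//		op.RetrieveTPUEmbeddingProximalAdagradParametersTableName("table0"))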
+func RetrieveTPUEmbeddingProximalAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingProximalAdagradParameters", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug. +type RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr) + +// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Retrieve proximal Adagrad embedding parameters with debug support. +// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the proximal Adagrad optimization algorithm.Parameter accumulators updated by the proximal Adagrad optimization algorithm.Parameter gradient_accumulators updated by the proximal Adagrad optimization algorithm. +func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingMomentumParametersGradAccumDebug. +type LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr) + +// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value. 
+// If not specified, defaults to "" +func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Load Momentum embedding parameters with debug support. +// +// An op that loads optimization parameters into HBM for embedding. Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. +// +// Arguments: +// parameters: Value of parameters used in the Momentum optimization algorithm. +// momenta: Value of momenta used in the Momentum optimization algorithm. +// gradient_accumulators: Value of gradient_accumulators used in the Momentum optimization algorithm. +// +// +// +// Returns the created operation. +func LoadTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LoadTPUEmbeddingMomentumParametersGradAccumDebug", + Input: []tf.Input{ + parameters, momenta, gradient_accumulators, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// CudnnRNNBackpropV3Attr is an optional argument to CudnnRNNBackpropV3. +type CudnnRNNBackpropV3Attr func(optionalAttr) + +// CudnnRNNBackpropV3RnnMode sets the optional rnn_mode attribute to value. +// If not specified, defaults to "lstm" +func CudnnRNNBackpropV3RnnMode(value string) CudnnRNNBackpropV3Attr { + return func(m optionalAttr) { + m["rnn_mode"] = value + } +} + +// CudnnRNNBackpropV3InputMode sets the optional input_mode attribute to value. +// If not specified, defaults to "linear_input" +func CudnnRNNBackpropV3InputMode(value string) CudnnRNNBackpropV3Attr { + return func(m optionalAttr) { + m["input_mode"] = value + } +} + +// CudnnRNNBackpropV3Direction sets the optional direction attribute to value. +// If not specified, defaults to "unidirectional" +func CudnnRNNBackpropV3Direction(value string) CudnnRNNBackpropV3Attr { + return func(m optionalAttr) { + m["direction"] = value + } +} + +// CudnnRNNBackpropV3Dropout sets the optional dropout attribute to value. +// If not specified, defaults to 0 +func CudnnRNNBackpropV3Dropout(value float32) CudnnRNNBackpropV3Attr { + return func(m optionalAttr) { + m["dropout"] = value + } +} + +// CudnnRNNBackpropV3Seed sets the optional seed attribute to value. +// If not specified, defaults to 0 +func CudnnRNNBackpropV3Seed(value int64) CudnnRNNBackpropV3Attr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// CudnnRNNBackpropV3Seed2 sets the optional seed2 attribute to value. +// If not specified, defaults to 0 +func CudnnRNNBackpropV3Seed2(value int64) CudnnRNNBackpropV3Attr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// CudnnRNNBackpropV3TimeMajor sets the optional time_major attribute to value. +// If not specified, defaults to true +func CudnnRNNBackpropV3TimeMajor(value bool) CudnnRNNBackpropV3Attr { + return func(m optionalAttr) { + m["time_major"] = value + } +} + +// Backprop step of CudnnRNNV3. 
+// +// Compute the backprop of both data and weights in a RNN. Takes an extra +// "sequence_lengths" input than CudnnRNNBackprop. +// +// rnn_mode: Indicates the type of the RNN model. +// input_mode: Indicates whether there is a linear projection between the input and +// the actual computation before the first layer. 'skip_input' is only allowed +// when input_size == num_units; 'auto_select' implies 'skip_input' when +// input_size == num_units; otherwise, it implies 'linear_input'. +// direction: Indicates whether a bidirectional model will be used. Should be +// "unidirectional" or "bidirectional". +// dropout: Dropout probability. When set to 0., dropout is disabled. +// seed: The 1st part of a seed to initialize dropout. +// seed2: The 2nd part of a seed to initialize dropout. +// input: If time_major is true, this is a 3-D tensor with the shape of +// [seq_length, batch_size, input_size]. If time_major is false, the shape is +// [batch_size, seq_length, input_size]. +// input_h: If time_major is true, this is a 3-D tensor with the shape of +// [num_layer * dir, batch_size, num_units]. If time_major is false, the shape +// is [batch_size, num_layer * dir, num_units]. +// input_c: For LSTM, a 3-D tensor with the shape of +// [num_layer * dir, batch, num_units]. For other models, it is ignored. +// params: A 1-D tensor that contains the weights and biases in an opaque layout. +// The size must be created through CudnnRNNParamsSize, and initialized +// separately. Note that they might not be compatible across different +// generations. So it is a good idea to save and restore +// sequence_lengths: a vector of lengths of each input sequence. +// output: If time_major is true, this is a 3-D tensor with the shape of +// [seq_length, batch_size, dir * num_units]. If time_major is false, the +// shape is [batch_size, seq_length, dir * num_units]. +// output_h: The same shape has input_h. +// output_c: The same shape as input_c for LSTM. An empty tensor for other models. +// output_backprop: A 3-D tensor with the same shape as output in the forward pass. +// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward +// pass. +// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward +// pass. +// time_major: Indicates whether the input/output format is time major or batch +// major. +// reserve_space: The same reserve_space produced in the forward operation. +// input_backprop: The backprop to input in the forward pass. Has the same shape +// as input. +// input_h_backprop: The backprop to input_h in the forward pass. Has the same +// shape as input_h. +// input_c_backprop: The backprop to input_c in the forward pass. Has the same +// shape as input_c. +// params_backprop: The backprop to the params buffer in the forward pass. Has the +// same shape as params. 
+func CudnnRNNBackpropV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, host_reserved tf.Output, optional ...CudnnRNNBackpropV3Attr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "CudnnRNNBackpropV3", + Input: []tf.Input{ + input, input_h, input_c, params, sequence_lengths, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, host_reserved, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3) +} + +// Makes the summary of quantiles for the batch. +// +// An op that takes a list of tensors (one tensor per feature) and outputs the +// quantile summaries for each tensor. +// +// Arguments: +// float_values: float; List of Rank 1 Tensors each containing values for a single feature. +// example_weights: float; Rank 1 Tensor with weights per instance. +// epsilon: float; The required maximum approximation error. +// +// Returns float; List of Rank 2 Tensors each containing the quantile summary +// (value, weight, min_rank, max_rank) of a single feature. +func BoostedTreesMakeQuantileSummaries(scope *Scope, float_values []tf.Output, example_weights tf.Output, epsilon tf.Output) (summaries []tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BoostedTreesMakeQuantileSummaries", + Input: []tf.Input{ + tf.OutputList(float_values), example_weights, epsilon, + }, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if summaries, idx, err = makeOutputList(op, idx, "summaries"); err != nil { + scope.UpdateErr("BoostedTreesMakeQuantileSummaries", err) + return + } + return summaries +} + +// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta. +type ResourceSparseApplyAdadeltaAttr func(optionalAttr) + +// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value. +// +// value: If True, updating of the var and accum tensors will be protected by +// a lock; otherwise the behavior is undefined, but may exhibit less contention. +// If not specified, defaults to false +func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// var: Should be from a Variable(). +// +// Arguments: +// +// accum: Should be from a Variable(). +// accum_update: : Should be from a Variable(). +// lr: Learning rate. Must be a scalar. +// rho: Decay factor. Must be a scalar. +// epsilon: Constant factor. Must be a scalar. +// grad: The gradient. +// indices: A vector of indices into the first dimension of var and accum. +// +// Returns the created operation. 
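+//
+// Illustrative usage sketch (editorial addition, not generated op
+// documentation). The variable handles, gradient, and hyperparameters below
+// are invented for the example; only graph construction is shown:
+//
+//	s := op.NewScope()
+//	shape := tf.MakeShape(4, 2)
+//	v := op.VarHandleOp(s, tf.Float, shape)           // variable to update
+//	accum := op.VarHandleOp(s, tf.Float, shape)       // Adadelta accumulator
+//	accumUpdate := op.VarHandleOp(s, tf.Float, shape) // Adadelta update accumulator
+//	grad := op.Const(s, [][]float32{{0.1, 0.2}, {0.3, 0.4}})
+//	indices := op.Const(s, []int32{0, 2})
+//	step := op.ResourceSparseApplyAdadelta(s, v, accum, accumUpdate,
+//		op.Const(s, float32(0.01)), op.Const(s, float32(0.95)), op.Const(s, float32(1e-7)),
+//		grad, indices)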
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceSparseApplyAdadelta", + Input: []tf.Input{ + var_, accum, accum_update, lr, rho, epsilon, grad, indices, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// EnqueueTPUEmbeddingIntegerBatchAttr is an optional argument to EnqueueTPUEmbeddingIntegerBatch. +type EnqueueTPUEmbeddingIntegerBatchAttr func(optionalAttr) + +// EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal sets the optional device_ordinal attribute to value. +// +// value: The TPU device to use. Should be >= 0 and less than the number +// of TPU cores in the task on which the node is placed. +// If not specified, defaults to -1 +func EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingIntegerBatchAttr { + return func(m optionalAttr) { + m["device_ordinal"] = value + } +} + +// An op that enqueues a list of input batch tensors to TPUEmbedding. +// +// Arguments: +// batch: A list of 1D tensors, one for each embedding table, containing the +// indices into the tables. +// mode_override: A string input that overrides the mode specified in the +// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference', +// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set +// in TPUEmbeddingConfiguration is used, otherwise mode_override is used. +// +// Returns the created operation. +func EnqueueTPUEmbeddingIntegerBatch(scope *Scope, batch []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingIntegerBatchAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "EnqueueTPUEmbeddingIntegerBatch", + Input: []tf.Input{ + tf.OutputList(batch), mode_override, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Computes gradients for the scaled exponential linear (Selu) operation. +// +// Arguments: +// gradients: The backpropagated gradients to the corresponding Selu operation. +// outputs: The outputs of the corresponding Selu operation. +// +// Returns The gradients: `gradients * (outputs + scale * alpha)` +// if outputs < 0, `scale * gradients` otherwise. +func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SeluGrad", + Input: []tf.Input{ + gradients, outputs, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Draw bounding boxes on a batch of images. +// +// Outputs a copy of `images` but draws on top of the pixels zero or more bounding +// boxes specified by the locations in `boxes`. The coordinates of the each +// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The +// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and +// height of the underlying image. +// +// For example, if an image is 100 x 200 pixels (height x width) and the bounding +// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of +// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates). 
+// +// Parts of the bounding box may fall outside the image. +// +// Arguments: +// images: 4-D with shape `[batch, height, width, depth]`. A batch of images. +// boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding +// boxes. +// +// Returns 4-D with the same shape as `images`. The batch of input images with +// bounding boxes drawn on the images. +func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "DrawBoundingBoxes", + Input: []tf.Input{ + images, boxes, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RegexReplaceAttr is an optional argument to RegexReplace. +type RegexReplaceAttr func(optionalAttr) + +// RegexReplaceReplaceGlobal sets the optional replace_global attribute to value. +// +// value: If True, the replacement is global (that is, all matches of the `pattern` regular +// expression in each input string are rewritten), otherwise the `rewrite` +// substitution is only made for the first `pattern` match. +// If not specified, defaults to true +func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr { + return func(m optionalAttr) { + m["replace_global"] = value + } +} + +// Replaces matches of the `pattern` regular expression in `input` with the +// replacement string provided in `rewrite`. +// +// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax) +// +// Arguments: +// input: The text to be processed. +// pattern: The regular expression to be matched in the `input` strings. +// rewrite: The rewrite string to be substituted for the `pattern` expression where it is +// matched in the `input` strings. +// +// Returns The text after applying pattern match and rewrite substitution. +func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RegexReplace", + Input: []tf.Input{ + input, pattern, rewrite, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AssertAttr is an optional argument to Assert. +type AssertAttr func(optionalAttr) + +// AssertSummarize sets the optional summarize attribute to value. +// +// value: Print this many entries of each tensor. +// If not specified, defaults to 3 +func AssertSummarize(value int64) AssertAttr { + return func(m optionalAttr) { + m["summarize"] = value + } +} + +// Asserts that the given condition is true. +// +// If `condition` evaluates to false, print the list of tensors in `data`. +// `summarize` determines how many entries of the tensors to print. +// +// Arguments: +// condition: The condition to evaluate. +// data: The tensors to print out when condition is false. +// +// Returns the created operation. +func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Assert", + Input: []tf.Input{ + condition, tf.OutputList(data), + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Returns x / y element-wise for real types. +// +// If `x` and `y` are reals, this will return the floating-point division. +// +// *NOTE*: `Div` supports broadcasting. 
More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "RealDiv", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// A dataset that creates window datasets from the input dataset. +// +// Arguments: +// +// size: A scalar representing the number of elements to accumulate in a window. +// shift: A scalar representing the steps moving the sliding window forward in one +// iteration. It must be positive. +// stride: A scalar representing the stride of the input elements of the sliding window. +// It must be positive. +// drop_remainder: A scalar representing whether a window should be dropped in case its size is +// smaller than desired. +// +// +func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "WindowDataset", + Input: []tf.Input{ + input_dataset, size, shift, stride, drop_remainder, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// StatefulStandardNormalAttr is an optional argument to StatefulStandardNormal. +type StatefulStandardNormalAttr func(optionalAttr) + +// StatefulStandardNormalDtype sets the optional dtype attribute to value. +// +// value: The type of the output. +// If not specified, defaults to DT_FLOAT +func StatefulStandardNormalDtype(value tf.DataType) StatefulStandardNormalAttr { + return func(m optionalAttr) { + m["dtype"] = value + } +} + +// Outputs random values from a normal distribution. This op is deprecated in favor of op 'StatefulStandardNormalV2' +// +// DEPRECATED at GraphDef version 29: Use StatefulStandardNormalV2 instead +// +// The generated values will have mean 0 and standard deviation 1. +// +// Arguments: +// resource: The handle of the resource variable that stores the state of the RNG. +// shape: The shape of the output tensor. +// +// Returns A tensor of the specified shape filled with random normal values. +func StatefulStandardNormal(scope *Scope, resource tf.Output, shape tf.Output, optional ...StatefulStandardNormalAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StatefulStandardNormal", + Input: []tf.Input{ + resource, shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// 2D real-valued fast Fourier transform. +// +// Computes the 2-dimensional discrete Fourier transform of a real-valued signal +// over the inner-most 2 dimensions of `input`. +// +// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the +// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension +// of `output`: the zero-frequency term, followed by the `fft_length / 2` +// positive-frequency terms. +// +// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the +// corresponding dimension of `input`, the dimension is cropped. If it is larger, +// the dimension is padded with zeros. +// +// Arguments: +// input: A float32 tensor. 
+// fft_length: An int32 tensor of shape [2]. The FFT length for each dimension. +// +// Returns A complex64 tensor of the same rank as `input`. The inner-most 2 +// dimensions of `input` are replaced with their 2D Fourier transform. The +// inner-most dimension contains `fft_length / 2 + 1` unique frequency +// components. +// +// @compatibility(numpy) +// Equivalent to np.fft.rfft2 +// @end_compatibility +func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "RFFT2D", + Input: []tf.Input{ + input, fft_length, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// A placeholder op for a value that will be fed into the computation. +// +// Arguments: +// dtype: The type of elements in the tensor. +// shape: The shape of the tensor. +// +// Returns A tensor that will be provided using the infeed mechanism. +func InfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype, "shape": shape} + opspec := tf.OpSpec{ + Type: "InfeedDequeue", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace. +type StaticRegexReplaceAttr func(optionalAttr) + +// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value. +// +// value: If True, the replacement is global, otherwise the replacement +// is done only on the first match. +// If not specified, defaults to true +func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr { + return func(m optionalAttr) { + m["replace_global"] = value + } +} + +// Replaces the match of pattern in input with rewrite. +// +// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax) +// +// Arguments: +// input: The text to be processed. +// pattern: The regular expression to match the input. +// rewrite: The rewrite to be applied to the matched expression. +// +// Returns The text after applying pattern and rewrite. +func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StaticRegexReplace", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp. +type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr) + +// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var, mg, ms, and mom tensors is +// protected by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' according to the centered RMSProp algorithm. +// +// The centered RMSProp algorithm uses an estimate of the centered second moment +// (i.e., the variance) for normalization, as opposed to regular RMSProp, which +// uses the (uncentered) second moment. 
This often helps with training, but is +// slightly more expensive in terms of computation and memory. +// +// Note that in dense implementation of this algorithm, mg, ms, and mom will +// update even if the grad is zero, but in this sparse implementation, mg, ms, +// and mom will not update in iterations during which the grad is zero. +// +// mean_square = decay * mean_square + (1-decay) * gradient ** 2 +// mean_grad = decay * mean_grad + (1-decay) * gradient +// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) +// +// ms <- rho * ms_{t-1} + (1-rho) * grad * grad +// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +// var <- var - mom +// +// Arguments: +// var_: Should be from a Variable(). +// mg: Should be from a Variable(). +// ms: Should be from a Variable(). +// mom: Should be from a Variable(). +// lr: Scaling factor. Must be a scalar. +// rho: Decay rate. Must be a scalar. +// +// epsilon: Ridge term. Must be a scalar. +// grad: The gradient. +// indices: A vector of indices into the first dimension of var, ms and mom. +// +// Returns the created operation. +func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceSparseApplyCenteredRMSProp", + Input: []tf.Input{ + var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Deprecated. Use TensorArraySplitV3 +// +// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3 +func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorArraySplitV2", + Input: []tf.Input{ + handle, value, lengths, flow_in, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a dataset that uses a custom thread pool to compute `input_dataset`. +// +// Arguments: +// +// thread_pool: A resource produced by the ThreadPoolHandle op. +// +// +func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread_pool tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "ExperimentalThreadPoolDataset", + Input: []tf.Input{ + input_dataset, thread_pool, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Check if the input matches the regex pattern. +// +// The input is a string tensor of any shape. The pattern is the +// regular expression to be matched with every element of the input tensor. +// The boolean values (True or False) of the output tensor indicate +// if the input matches the regex pattern provided. +// +// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax) +// +// Arguments: +// input: A string tensor of the text to be processed. +// pattern: The regular expression to match the input. +// +// Returns A bool tensor with the same shape as `input`. 
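+//
+// Illustrative usage sketch (editorial addition, not generated op
+// documentation), assuming the TensorFlow Go `op` package:
+//
+//	s := op.NewScope()
+//	text := op.Const(s, []string{"123", "abc"})
+//	isNumeric := op.StaticRegexFullMatch(s, text, "[0-9]+")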
+func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"pattern": pattern} + opspec := tf.OpSpec{ + Type: "StaticRegexFullMatch", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Converts each string in the input Tensor to its hash mod by a number of buckets. +// +// The hash function is deterministic on the content of the string within the +// process and will never change. However, it is not suitable for cryptography. +// This function may be used when CPU time is scarce and inputs are trusted or +// unimportant. There is a risk of adversaries constructing inputs that all hash +// to the same bucket. To prevent this problem, use a strong hash function with +// `tf.string_to_hash_bucket_strong`. +// +// Arguments: +// input: The strings to assign a hash bucket. +// num_buckets: The number of buckets. +// +// Returns A Tensor of the same shape as the input `string_tensor`. +func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_buckets": num_buckets} + opspec := tf.OpSpec{ + Type: "StringToHashBucketFast", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Converts each string in the input Tensor to its hash mod by a number of buckets. +// +// The hash function is deterministic on the content of the string within the +// process. The hash function is a keyed hash function, where attribute `key` +// defines the key of the hash function. `key` is an array of 2 elements. +// +// A strong hash is important when inputs may be malicious, e.g. URLs with +// additional components. Adversaries could try to make their inputs hash to the +// same bucket for a denial-of-service attack or to skew the results. A strong +// hash can be used to make it difficult to find inputs with a skewed hash value +// distribution over buckets. This requires that the hash function is +// seeded by a high-entropy (random) "key" unknown to the adversary. +// +// The additional robustness comes at a cost of roughly 4x higher compute +// time than `tf.string_to_hash_bucket_fast`. +// +// Arguments: +// input: The strings to assign a hash bucket. +// num_buckets: The number of buckets. +// key: The key used to seed the hash function, passed as a list of two uint64 +// elements. +// +// Returns A Tensor of the same shape as the input `string_tensor`. +func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key} + opspec := tf.OpSpec{ + Type: "StringToHashBucketStrong", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// UnbatchGradAttr is an optional argument to UnbatchGrad. +type UnbatchGradAttr func(optionalAttr) + +// UnbatchGradContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func UnbatchGradContainer(value string) UnbatchGradAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// UnbatchGradSharedName sets the optional shared_name attribute to value. 
+// If not specified, defaults to "" +func UnbatchGradSharedName(value string) UnbatchGradAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Gradient of Unbatch. +// +// Acts like Batch but using the given batch_index index of batching things as they +// become available. This ensures that the gradients are propagated back in the +// same session which did the forward pass. +// +// original_input: The input to the Unbatch operation this is the gradient of. +// batch_index: The batch_index given to the Unbatch operation this is the gradient +// of. +// grad: The downstream gradient. +// id: The id scalar emitted by Batch. +// batched_grad: The return value, either an empty tensor or the batched gradient. +// container: Container to control resource sharing. +// shared_name: Instances of UnbatchGrad with the same container and shared_name +// are assumed to possibly belong to the same batch. If left empty, the op name +// will be used as the shared name. +func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output, grad tf.Output, id tf.Output, optional ...UnbatchGradAttr) (batched_grad tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "UnbatchGrad", + Input: []tf.Input{ + original_input, batch_index, grad, id, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the truth value of (x != y) element-wise. +// +// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "NotEqual", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AsStringAttr is an optional argument to AsString. +type AsStringAttr func(optionalAttr) + +// AsStringPrecision sets the optional precision attribute to value. +// +// value: The post-decimal precision to use for floating point numbers. +// Only used if precision > -1. +// If not specified, defaults to -1 +func AsStringPrecision(value int64) AsStringAttr { + return func(m optionalAttr) { + m["precision"] = value + } +} + +// AsStringScientific sets the optional scientific attribute to value. +// +// value: Use scientific notation for floating point numbers. +// If not specified, defaults to false +func AsStringScientific(value bool) AsStringAttr { + return func(m optionalAttr) { + m["scientific"] = value + } +} + +// AsStringShortest sets the optional shortest attribute to value. +// +// value: Use shortest representation (either scientific or standard) for +// floating point numbers. +// If not specified, defaults to false +func AsStringShortest(value bool) AsStringAttr { + return func(m optionalAttr) { + m["shortest"] = value + } +} + +// AsStringWidth sets the optional width attribute to value. +// +// value: Pad pre-decimal numbers to this width. +// Applies to both floating point and integer numbers. +// Only used if width > -1. +// If not specified, defaults to -1 +func AsStringWidth(value int64) AsStringAttr { + return func(m optionalAttr) { + m["width"] = value + } +} + +// AsStringFill sets the optional fill attribute to value. +// +// value: The value to pad if width > -1. If empty, pads with spaces. +// Another typical value is '0'. String cannot be longer than 1 character. 
+// If not specified, defaults to "" +func AsStringFill(value string) AsStringAttr { + return func(m optionalAttr) { + m["fill"] = value + } +} + +// Converts each entry in the given tensor to strings. Supports many numeric +// +// types and boolean. +func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "AsString", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// StringJoinAttr is an optional argument to StringJoin. +type StringJoinAttr func(optionalAttr) + +// StringJoinSeparator sets the optional separator attribute to value. +// +// value: string, an optional join separator. +// If not specified, defaults to "" +func StringJoinSeparator(value string) StringJoinAttr { + return func(m optionalAttr) { + m["separator"] = value + } +} + +// Joins the strings in the given list of string tensors into one tensor; +// +// with the given separator (default is an empty separator). +// +// Arguments: +// inputs: A list of string tensors. The tensors must all have the same shape, +// or be scalars. Scalars may be mixed in; these will be broadcast to the shape +// of non-scalar inputs. +func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StringJoin", + Input: []tf.Input{ + tf.OutputList(inputs), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Slice a `SparseTensor` based on the `start` and `size`. +// +// For example, if the input is +// +// input_tensor = shape = [2, 7] +// [ a d e ] +// [b c ] +// +// Graphically the output tensors are: +// +// sparse_slice([0, 0], [2, 4]) = shape = [2, 4] +// [ a ] +// [b c ] +// +// sparse_slice([0, 4], [2, 3]) = shape = [2, 3] +// [ d e ] +// [ ] +// +// Arguments: +// indices: 2-D tensor represents the indices of the sparse tensor. +// values: 1-D tensor represents the values of the sparse tensor. +// shape: 1-D. tensor represents the shape of the sparse tensor. +// start: 1-D. tensor represents the start of the slice. +// size: 1-D. tensor represents the size of the slice. +// output indices: A list of 1-D tensors represents the indices of the output +// sparse tensors. +// +// Returns A list of 1-D tensors represents the values of the output sparse +// tensors.A list of 1-D tensors represents the shape of the output sparse +// tensors. +func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseSlice", + Input: []tf.Input{ + indices, values, shape, start, size, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Inserts a dimension of 1 into a tensor's shape. +// +// Given a tensor `input`, this operation inserts a dimension of 1 at the +// dimension index `axis` of `input`'s shape. The dimension index `axis` starts at +// zero; if you specify a negative number for `axis` it is counted backward from +// the end. 
+// +// This operation is useful if you want to add a batch dimension to a single +// element. For example, if you have a single image of shape `[height, width, +// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`, +// which will make the shape `[1, height, width, channels]`. +// +// Other examples: +// +// ``` +// # 't' is a tensor of shape [2] +// shape(expand_dims(t, 0)) ==> [1, 2] +// shape(expand_dims(t, 1)) ==> [2, 1] +// shape(expand_dims(t, -1)) ==> [2, 1] +// +// # 't2' is a tensor of shape [2, 3, 5] +// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5] +// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5] +// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1] +// ``` +// +// This operation requires that: +// +// `-1-input.dims() <= dim <= input.dims()` +// +// This operation is related to `squeeze()`, which removes dimensions of +// size 1. +// +// Arguments: +// +// axis: 0-D (scalar). Specifies the dimension index at which to +// expand the shape of `input`. Must be in the range +// `[-rank(input) - 1, rank(input)]`. +// +// Returns Contains the same data as `input`, but its shape has an additional +// dimension of size 1 added. +func ExpandDims(scope *Scope, input tf.Output, axis tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ExpandDims", + Input: []tf.Input{ + input, axis, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Inverse real-valued fast Fourier transform. +// +// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued +// signal over the inner-most dimension of `input`. +// +// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the +// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If +// `fft_length` is not provided, it is computed from the size of the inner-most +// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to +// compute `input` is odd, it should be provided since it cannot be inferred +// properly. +// +// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller +// than the corresponding dimension of `input`, the dimension is cropped. If it is +// larger, the dimension is padded with zeros. +// +// Arguments: +// input: A complex64 tensor. +// fft_length: An int32 tensor of shape [1]. The FFT length. +// +// Returns A float32 tensor of the same rank as `input`. The inner-most +// dimension of `input` is replaced with the `fft_length` samples of its inverse +// 1D Fourier transform. +// +// @compatibility(numpy) +// Equivalent to np.fft.irfft +// @end_compatibility +func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "IRFFT", + Input: []tf.Input{ + input, fft_length, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Subtracts a value from the current value of a variable. +// +// Any ReadVariableOp with a control dependency on this op is guaranteed to +// see the decremented value or a subsequent newer one. +// +// Arguments: +// resource: handle to the resource in which to store the variable. +// value: the value by which the variable will be incremented. +// +// Returns the created operation. 
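+//
+// Illustrative usage (a hedged sketch, not part of the generated op docs): it
+// assumes a Scope from op.NewScope(), the Const and VarHandleOp wrappers defined
+// elsewhere in this package, and the tf.ScalarShape helper, with the variable
+// initialized separately.
+//
+// ```go
+// s := op.NewScope()
+// v := op.VarHandleOp(s, tf.Float, tf.ScalarShape()) // handle to a scalar float32 variable
+// delta := op.Const(s, float32(1.5))                 // amount to subtract
+// sub := op.AssignSubVariableOp(s, v, delta)         // *tf.Operation; run it as a session target
+// _ = sub
+// ```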
+func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "AssignSubVariableOp", + Input: []tf.Input{ + resource, value, + }, + } + return scope.AddOperation(opspec) +} + +// SerializeManySparseAttr is an optional argument to SerializeManySparse. +type SerializeManySparseAttr func(optionalAttr) + +// SerializeManySparseOutType sets the optional out_type attribute to value. +// +// value: The `dtype` to use for serialization; the supported types are `string` +// (default) and `variant`. +// If not specified, defaults to DT_STRING +func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object. +// +// The `SparseTensor` must have rank `R` greater than 1, and the first dimension +// is treated as the minibatch dimension. Elements of the `SparseTensor` +// must be sorted in increasing order of this first dimension. The serialized +// `SparseTensor` objects going into each row of `serialized_sparse` will have +// rank `R-1`. +// +// The minibatch size `N` is extracted from `sparse_shape[0]`. +// +// Arguments: +// sparse_indices: 2-D. The `indices` of the minibatch `SparseTensor`. +// sparse_values: 1-D. The `values` of the minibatch `SparseTensor`. +// sparse_shape: 1-D. The `shape` of the minibatch `SparseTensor`. +func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SerializeManySparse", + Input: []tf.Input{ + sparse_indices, sparse_values, sparse_shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the determinant of one or more square matrices. +// +// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +// form square matrices. The output is a tensor containing the determinants +// for all input submatrices `[..., :, :]`. +// +// Arguments: +// input: Shape is `[..., M, M]`. +// +// Returns Shape is `[...]`. +func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "MatrixDeterminant", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// An Op to exchange data across TPU replicas. +// +// On each replica, the input is split into `split_count` blocks along +// `split_dimension` and send to the other replicas given group_assignment. After +// receiving `split_count` - 1 blocks from other replicas, we concatenate the +// blocks along `concat_dimension` as the output. +// +// For example, suppose there are 2 TPU replicas: +// replica 0 receives input: `[[A, B]]` +// replica 1 receives input: `[[C, D]]` +// +// group_assignment=`[[0, 1]]` +// concat_dimension=0 +// split_dimension=1 +// split_count=2 +// +// replica 0's output: `[[A], [C]]` +// replica 1's output: `[[B], [D]]` +// +// Arguments: +// input: The local input to the sum. +// group_assignment: An int32 tensor with shape +// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the +// replica ids in the ith subgroup. 
+// concat_dimension: The dimension number to concatenate. +// split_dimension: The dimension number to split. +// split_count: The number of splits, this number must equal to the sub-group +// size(group_assignment.get_shape()[1]) +// +// Returns The exchanged result. +func AllToAll(scope *Scope, input tf.Output, group_assignment tf.Output, concat_dimension int64, split_dimension int64, split_count int64) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"concat_dimension": concat_dimension, "split_dimension": split_dimension, "split_count": split_count} + opspec := tf.OpSpec{ + Type: "AllToAll", + Input: []tf.Input{ + input, group_assignment, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// SubstrAttr is an optional argument to Substr. +type SubstrAttr func(optionalAttr) + +// SubstrUnit sets the optional unit attribute to value. +// +// value: The unit that is used to create the substring. One of: `"BYTE"` (for +// defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8 +// encoded Unicode code points). The default is `"BYTE"`. Results are undefined if +// `unit=UTF8_CHAR` and the `input` strings do not contain structurally valid +// UTF-8. +// If not specified, defaults to "BYTE" +func SubstrUnit(value string) SubstrAttr { + return func(m optionalAttr) { + m["unit"] = value + } +} + +// Return substrings from `Tensor` of strings. +// +// For each string in the input `Tensor`, creates a substring starting at index +// `pos` with a total length of `len`. +// +// If `len` defines a substring that would extend beyond the length of the input +// string, then as many characters as possible are used. +// +// A negative `pos` indicates distance within the string backwards from the end. +// +// If `pos` specifies an index which is out of range for any of the input strings, +// then an `InvalidArgumentError` is thrown. +// +// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on +// Op creation. +// +// *NOTE*: `Substr` supports broadcasting up to two dimensions. 
More about +// broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +// +// --- +// +// Examples +// +// Using scalar `pos` and `len`: +// +// ```python +// input = [b'Hello', b'World'] +// position = 1 +// length = 3 +// +// output = [b'ell', b'orl'] +// ``` +// +// Using `pos` and `len` with same shape as `input`: +// +// ```python +// input = [[b'ten', b'eleven', b'twelve'], +// [b'thirteen', b'fourteen', b'fifteen'], +// [b'sixteen', b'seventeen', b'eighteen']] +// position = [[1, 2, 3], +// [1, 2, 3], +// [1, 2, 3]] +// length = [[2, 3, 4], +// [4, 3, 2], +// [5, 5, 5]] +// +// output = [[b'en', b'eve', b'lve'], +// [b'hirt', b'urt', b'te'], +// [b'ixtee', b'vente', b'hteen']] +// ``` +// +// Broadcasting `pos` and `len` onto `input`: +// +// ``` +// input = [[b'ten', b'eleven', b'twelve'], +// [b'thirteen', b'fourteen', b'fifteen'], +// [b'sixteen', b'seventeen', b'eighteen'], +// [b'nineteen', b'twenty', b'twentyone']] +// position = [1, 2, 3] +// length = [1, 2, 3] +// +// output = [[b'e', b'ev', b'lve'], +// [b'h', b'ur', b'tee'], +// [b'i', b've', b'hte'], +// [b'i', b'en', b'nty']] +// ``` +// +// Broadcasting `input` onto `pos` and `len`: +// +// ``` +// input = b'thirteen' +// position = [1, 5, 7] +// length = [3, 2, 1] +// +// output = [b'hir', b'ee', b'n'] +// ``` +// +// Arguments: +// input: Tensor of strings +// pos: Scalar defining the position of first character in each substring +// len: Scalar defining the number of characters to include in each substring +// +// Returns Tensor of substrings +func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Substr", + Input: []tf.Input{ + input, pos, len, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Produce a string tensor that encodes the state of a Reader. +// +// Not all Readers support being serialized, so this can produce an +// Unimplemented error. +// +// Arguments: +// reader_handle: Handle to a Reader. +func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ReaderSerializeStateV2", + Input: []tf.Input{ + reader_handle, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a dataset that concatenates `input_dataset` with `another_dataset`. +func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "ConcatenateDataset", + Input: []tf.Input{ + input_dataset, another_dataset, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad. +type MaxPool3DGradAttr func(optionalAttr) + +// MaxPool3DGradDataFormat sets the optional data_format attribute to value. +// +// value: The data format of the input and output data. With the +// default format "NDHWC", the data is stored in the order of: +// [batch, in_depth, in_height, in_width, in_channels]. 
+// Alternatively, the format could be "NCDHW", the data storage order is: +// [batch, in_channels, in_depth, in_height, in_width]. +// If not specified, defaults to "NDHWC" +func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Computes gradients of max pooling function. +// +// Arguments: +// orig_input: The original input tensor. +// orig_output: The original output tensor. +// grad: Output backprop of shape `[batch, depth, rows, cols, channels]`. +// ksize: 1-D tensor of length 5. The size of the window for each dimension of +// the input tensor. Must have `ksize[0] = ksize[4] = 1`. +// strides: 1-D tensor of length 5. The stride of the sliding window for each +// dimension of `input`. Must have `strides[0] = strides[4] = 1`. +// padding: The type of padding algorithm to use. +func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MaxPool3DGrad", + Input: []tf.Input{ + orig_input, orig_output, grad, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Return a slice from 'input'. +// +// The output tensor is a tensor with dimensions described by 'size' +// whose values are extracted from 'input' starting at the offsets in +// 'begin'. +// +// *Requirements*: +// 0 <= begin[i] <= begin[i] + size[i] <= Di for i in [0, n) +// +// Arguments: +// +// begin: begin[i] specifies the offset into the 'i'th dimension of +// 'input' to slice from. +// size: size[i] specifies the number of elements of the 'i'th dimension +// of 'input' to slice. If size[i] is -1, all remaining elements in dimension +// i are included in the slice (i.e. this is equivalent to setting +// size[i] = input.dim_size(i) - begin[i]). +func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Slice", + Input: []tf.Input{ + input, begin, size, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal. +type StatelessRandomNormalAttr func(optionalAttr) + +// StatelessRandomNormalDtype sets the optional dtype attribute to value. +// +// value: The type of the output. +// If not specified, defaults to DT_FLOAT +func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr { + return func(m optionalAttr) { + m["dtype"] = value + } +} + +// Outputs deterministic pseudorandom values from a normal distribution. +// +// The generated values will have mean 0 and standard deviation 1. +// +// The outputs are a deterministic function of `shape` and `seed`. +// +// Arguments: +// shape: The shape of the output tensor. +// seed: 2 seeds (shape [2]). +// +// Returns Random values with specified shape. 
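+//
+// Illustrative usage (a hedged sketch, not part of the generated op docs; it
+// assumes a Scope from op.NewScope() and the Const wrapper from this package):
+//
+// ```go
+// s := op.NewScope()
+// shape := op.Const(s, []int32{2, 3})  // output shape [2, 3]
+// seed := op.Const(s, []int64{7, 11})  // 2 seeds (shape [2])
+// // Deterministic: the same shape and seed always produce the same values.
+// vals := op.StatelessRandomNormal(s, shape, seed, op.StatelessRandomNormalDtype(tf.Float))
+// _ = vals
+// ```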
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StatelessRandomNormal", + Input: []tf.Input{ + shape, seed, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Gets the next output from the given iterator . +func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "IteratorGetNext", + Input: []tf.Input{ + iterator, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if components, idx, err = makeOutputList(op, idx, "components"); err != nil { + scope.UpdateErr("IteratorGetNext", err) + return + } + return components +} + +// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate. +type ResourceScatterNdUpdateAttr func(optionalAttr) + +// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value. +// +// value: An optional bool. Defaults to True. If True, the assignment will +// be protected by a lock; otherwise the behavior is undefined, +// but may exhibit less contention. +// If not specified, defaults to true +func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Applies sparse `updates` to individual values or slices within a given +// +// variable according to `indices`. +// +// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. +// +// `indices` must be integer tensor, containing indices into `ref`. +// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`. +// +// The innermost dimension of `indices` (with length `K`) corresponds to +// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th +// dimension of `ref`. +// +// `updates` is `Tensor` of rank `Q-1+P-K` with shape: +// +// ``` +// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]. +// ``` +// +// For example, say we want to update 4 scattered elements to a rank-1 tensor to +// 8 elements. In Python, that update would look like this: +// +// ```python +// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8]) +// indices = tf.constant([[4], [3], [1] ,[7]]) +// updates = tf.constant([9, 10, 11, 12]) +// update = tf.scatter_nd_update(ref, indices, updates) +// with tf.Session() as sess: +// print sess.run(update) +// ``` +// +// The resulting update to ref would look like this: +// +// [1, 11, 3, 10, 9, 6, 7, 12] +// +// See `tf.scatter_nd` for more details about how to make updates to +// slices. +// +// Arguments: +// ref: A resource handle. Must be from a VarHandleOp. +// indices: A Tensor. Must be one of the following types: int32, int64. +// A tensor of indices into ref. +// updates: A Tensor. Must have the same type as ref. A tensor of updated +// values to add to ref. +// +// Returns the created operation. 
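+//
+// The Python example above, rewritten as a hedged sketch against this Go wrapper
+// (it assumes a Scope from op.NewScope(), the Const, VarHandleOp and tf.MakeShape
+// helpers, and a variable already initialized to [1, 2, 3, 4, 5, 6, 7, 8]):
+//
+// ```go
+// s := op.NewScope()
+// ref := op.VarHandleOp(s, tf.Int32, tf.MakeShape(8))
+// indices := op.Const(s, [][]int32{{4}, {3}, {1}, {7}})
+// updates := op.Const(s, []int32{9, 10, 11, 12})
+// // After running this op, the variable holds [1, 11, 3, 10, 9, 6, 7, 12].
+// update := op.ResourceScatterNdUpdate(s, ref, indices, updates)
+// _ = update
+// ```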
+func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceScatterNdUpdate", + Input: []tf.Input{ + ref, indices, updates, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Computes the reciprocal of x element-wise. +// +// I.e., \\(y = 1 / x\\). +func Inv(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Inv", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the maximum along segments of a tensor. +// +// Read +// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) +// for an explanation of segments. +// +// Computes a tensor such that +// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such +// that `segment_ids[j] == i`. +// +// If the max is empty for a given segment ID `i`, `output[i] = 0`. +// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt> +// </div> +// +// For example: +// +// ``` +// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]]) +// tf.segment_max(c, tf.constant([0, 0, 1])) +// # ==> [[4, 3, 3, 4], +// # [5, 6, 7, 8]] +// ``` +// +// +// Arguments: +// +// segment_ids: A 1-D tensor whose size is equal to the size of `data`'s +// first dimension. Values should be sorted and can be repeated. +// +// Returns Has same shape as data, except for dimension 0 which +// has size `k`, the number of segments. +func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SegmentMax", + Input: []tf.Input{ + data, segment_ids, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the element-wise min of two SparseTensors. +// +// Assumes the two SparseTensors have the same shape, i.e., no broadcasting. +// +// Arguments: +// a_indices: 2-D. `N x R` matrix with the indices of non-empty values in a +// SparseTensor, in the canonical lexicographic ordering. +// a_values: 1-D. `N` non-empty values corresponding to `a_indices`. +// a_shape: 1-D. Shape of the input SparseTensor. +// b_indices: counterpart to `a_indices` for the other operand. +// b_values: counterpart to `a_values` for the other operand; must be of the same dtype. +// b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal. +// +// Returns 2-D. The indices of the output SparseTensor.1-D. The values of the output SparseTensor. +func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseSparseMinimum", + Input: []tf.Input{ + a_indices, a_values, a_shape, b_indices, b_values, b_shape, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// Reduces sparse updates into the variable referenced by `resource` using the `min` operation. +// +// This operation computes +// +// # Scalar indices +// ref[indices, ...] 
= min(ref[indices, ...], updates[...]) +// +// # Vector indices (for each i) +// ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...]) +// +// # High rank indices (for each i, ..., j) +// ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...]) +// +// Duplicate entries are handled correctly: if multiple `indices` reference +// the same location, their contributions are combined. +// +// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. +// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt> +// </div> +// +// Arguments: +// resource: Should be from a `Variable` node. +// indices: A tensor of indices into the first dimension of `ref`. +// updates: A tensor of updated values to add to `ref`. +// +// Returns the created operation. +func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ResourceScatterMin", + Input: []tf.Input{ + resource, indices, updates, + }, + } + return scope.AddOperation(opspec) +} + +// 3D fast Fourier transform. +// +// Computes the 3-dimensional discrete Fourier transform over the inner-most 3 +// dimensions of `input`. +// +// Arguments: +// input: A complex64 tensor. +// +// Returns A complex64 tensor of the same shape as `input`. The inner-most 3 +// dimensions of `input` are replaced with their 3D Fourier transform. +// +// @compatibility(numpy) +// Equivalent to np.fft.fftn with 3 dimensions. +// @end_compatibility +func FFT3D(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "FFT3D", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Inverse 3D fast Fourier transform. +// +// Computes the inverse 3-dimensional discrete Fourier transform over the +// inner-most 3 dimensions of `input`. +// +// Arguments: +// input: A complex64 tensor. +// +// Returns A complex64 tensor of the same shape as `input`. The inner-most 3 +// dimensions of `input` are replaced with their inverse 3D Fourier transform. +// +// @compatibility(numpy) +// Equivalent to np.fft.ifftn with 3 dimensions. +// @end_compatibility +func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "IFFT3D", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a dataset that emits the lines of one or more text files. +// +// Arguments: +// filenames: A scalar or a vector containing the name(s) of the file(s) to be +// read. +// compression_type: A scalar containing either (i) the empty string (no +// compression), (ii) "ZLIB", or (iii) "GZIP". +// buffer_size: A scalar containing the number of bytes to buffer. +func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TextLineDataset", + Input: []tf.Input{ + filenames, compression_type, buffer_size, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Performs gradient updates of embedding tables. +// +// Arguments: +// inputs: A TensorList of gradients with which to update embedding tables. 
+// This argument has the same length and shapes as the return value of +// RecvTPUEmbeddingActivations, but contains gradients of the model's loss +// with respect to the embedding activations. The embedding tables are updated +// from these gradients via the optimizer specified in the TPU embedding +// configuration given to tpu.initialize_system. +// learning_rates: A TensorList of float32 scalars, one for each dynamic learning +// rate tag: see the comments in +// //third_party/tensorflow/core/protobuf/tpu/optimization_parameters.proto. +// Multiple tables can share the same dynamic learning rate tag as specified +// in the configuration. If the learning rates for all tables are constant, +// this list should be empty. +// config: Serialized TPUEmbeddingConfiguration proto. +// +// Returns the created operation. +func SendTPUEmbeddingGradients(scope *Scope, inputs []tf.Output, learning_rates []tf.Output, config string) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"config": config} + opspec := tf.OpSpec{ + Type: "SendTPUEmbeddingGradients", + Input: []tf.Input{ + tf.OutputList(inputs), tf.OutputList(learning_rates), + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Inverse 3D real-valued fast Fourier transform. +// +// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued +// signal over the inner-most 3 dimensions of `input`. +// +// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`: +// The inner-most dimension contains the `fft_length / 2 + 1` unique components of +// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed +// from the size of the inner-most 3 dimensions of `input`. If the FFT length used +// to compute `input` is odd, it should be provided since it cannot be inferred +// properly. +// +// Along each axis `IRFFT3D` is computed on, if `fft_length` (or +// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the +// corresponding dimension of `input`, the dimension is cropped. If it is larger, +// the dimension is padded with zeros. +// +// Arguments: +// input: A complex64 tensor. +// fft_length: An int32 tensor of shape [3]. The FFT length for each dimension. +// +// Returns A float32 tensor of the same rank as `input`. The inner-most 3 +// dimensions of `input` are replaced with the `fft_length` samples of their +// inverse 3D real Fourier transform. +// +// @compatibility(numpy) +// Equivalent to np.irfftn with 3 dimensions. +// @end_compatibility +func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "IRFFT3D", + Input: []tf.Input{ + input, fft_length, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Strip leading and trailing whitespaces from the Tensor. +// +// Arguments: +// input: A string `Tensor` of any shape. +// +// Returns A string `Tensor` of the same shape as the input. +func StringStrip(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "StringStrip", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2. +type FixedLengthRecordReaderV2Attr func(optionalAttr) + +// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value. 
+// +// value: Number of bytes in the header, defaults to 0. +// If not specified, defaults to 0 +func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr { + return func(m optionalAttr) { + m["header_bytes"] = value + } +} + +// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value. +// +// value: Number of bytes in the footer, defaults to 0. +// If not specified, defaults to 0 +func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr { + return func(m optionalAttr) { + m["footer_bytes"] = value + } +} + +// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value. +// +// value: Number of bytes to hop before each read. Default of 0 means using +// record_bytes. +// If not specified, defaults to 0 +func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr { + return func(m optionalAttr) { + m["hop_bytes"] = value + } +} + +// FixedLengthRecordReaderV2Container sets the optional container attribute to value. +// +// value: If non-empty, this reader is placed in the given container. +// Otherwise, a default container is used. +// If not specified, defaults to "" +func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this reader is named in the given bucket +// with this shared_name. Otherwise, the node name is used instead. +// If not specified, defaults to "" +func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value. +// +// value: The type of encoding for the file. Currently ZLIB and GZIP +// are supported. Defaults to none. +// If not specified, defaults to "" +func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr { + return func(m optionalAttr) { + m["encoding"] = value + } +} + +// A Reader that outputs fixed-length records from a file. +// +// Arguments: +// record_bytes: Number of bytes in the record. +// +// Returns The handle to reference the Reader. +func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"record_bytes": record_bytes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "FixedLengthRecordReaderV2", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// PlaceholderAttr is an optional argument to Placeholder. +type PlaceholderAttr func(optionalAttr) + +// PlaceholderShape sets the optional shape attribute to value. +// +// value: (Optional) The shape of the tensor. If the shape has 0 dimensions, the +// shape is unconstrained. +// If not specified, defaults to <unknown_rank:true > +func PlaceholderShape(value tf.Shape) PlaceholderAttr { + return func(m optionalAttr) { + m["shape"] = value + } +} + +// A placeholder op for a value that will be fed into the computation. +// +// N.B. This operation will fail with an error if it is executed. It is +// intended as a way to represent a value that will always be fed, and to +// provide attrs that enable the fed value to be checked at runtime. 
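+//
+// Illustrative usage (a hedged sketch, not part of the generated op docs; it
+// assumes a Scope from op.NewScope() and the tf.MakeShape helper, where -1
+// denotes an unknown dimension):
+//
+// ```go
+// s := op.NewScope()
+// // A float32 placeholder constrained to shape [?, 784]; it must be fed at run time.
+// x := op.Placeholder(s, tf.Float, op.PlaceholderShape(tf.MakeShape(-1, 784)))
+// _ = x
+// ```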
+// +// Arguments: +// dtype: The type of elements in the tensor. +// +// Returns A placeholder tensor that must be replaced using the feed mechanism. +func Placeholder(scope *Scope, dtype tf.DataType, optional ...PlaceholderAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Placeholder", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AvgPoolGradAttr is an optional argument to AvgPoolGrad. +type AvgPoolGradAttr func(optionalAttr) + +// AvgPoolGradDataFormat sets the optional data_format attribute to value. +// +// value: Specify the data format of the input and output data. With the +// default format "NHWC", the data is stored in the order of: +// [batch, in_height, in_width, in_channels]. +// Alternatively, the format could be "NCHW", the data storage order of: +// [batch, in_channels, in_height, in_width]. +// If not specified, defaults to "NHWC" +func AvgPoolGradDataFormat(value string) AvgPoolGradAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Computes gradients of the average pooling function. +// +// Arguments: +// orig_input_shape: 1-D. Shape of the original input to `avg_pool`. +// grad: 4-D with shape `[batch, height, width, channels]`. Gradients w.r.t. +// the output of `avg_pool`. +// ksize: The size of the sliding window for each dimension of the input. +// strides: The stride of the sliding window for each dimension of the input. +// padding: The type of padding algorithm to use. +// +// Returns 4-D. Gradients w.r.t. the input of `avg_pool`. +func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "AvgPoolGrad", + Input: []tf.Input{ + orig_input_shape, grad, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign. +type ResourceApplyAddSignAttr func(optionalAttr) + +// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var and m tensors is +// protected by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' according to the AddSign update. +// +// m_t <- beta1 * m_{t-1} + (1 - beta1) * g +// update <- (alpha + sign_decay * sign(g) *sign(m)) * g +// variable <- variable - lr_t * update +// +// Arguments: +// var_: Should be from a Variable(). +// m: Should be from a Variable(). +// lr: Scaling factor. Must be a scalar. +// alpha: Must be a scalar. +// sign_decay: Must be a scalar. +// beta: Must be a scalar. +// grad: The gradient. +// +// Returns the created operation. 
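+//
+// Illustrative usage (a hedged sketch, not part of the generated op docs; it
+// assumes a Scope from op.NewScope(), the Const, VarHandleOp and tf.MakeShape
+// helpers, with var_ and m initialized elsewhere and a real gradient in place
+// of the stand-in below):
+//
+// ```go
+// s := op.NewScope()
+// v := op.VarHandleOp(s, tf.Float, tf.MakeShape(10)) // the trainable variable
+// m := op.VarHandleOp(s, tf.Float, tf.MakeShape(10)) // the AddSign moving average
+// lr := op.Const(s, float32(0.01))
+// alpha := op.Const(s, float32(1.0))
+// signDecay := op.Const(s, float32(0.99))
+// beta := op.Const(s, float32(0.9))
+// grad := op.Const(s, make([]float32, 10))           // stand-in gradient
+// apply := op.ResourceApplyAddSign(s, v, m, lr, alpha, signDecay, beta, grad)
+// _ = apply // run as a session target to perform one AddSign update
+// ```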
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceApplyAddSign", + Input: []tf.Input{ + var_, m, lr, alpha, sign_decay, beta, grad, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Computes exponential of x element-wise. \\(y = e^x\\). +func Exp(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Exp", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Decode web-safe base64-encoded strings. +// +// Input may or may not have padding at the end. See EncodeBase64 for padding. +// Web-safe means that input must use - and _ instead of + and /. +// +// Arguments: +// input: Base64 strings to decode. +// +// Returns Decoded strings. +func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "DecodeBase64", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad. +type ResourceSparseApplyAdagradAttr func(optionalAttr) + +// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var and accum tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value. +// If not specified, defaults to true +func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr { + return func(m optionalAttr) { + m["update_slots"] = value + } +} + +// Update relevant entries in '*var' and '*accum' according to the adagrad scheme. +// +// That is for rows we have grad for, we update var and accum as follows: +// accum += grad * grad +// var -= lr * grad * (1 / sqrt(accum)) +// +// Arguments: +// var_: Should be from a Variable(). +// accum: Should be from a Variable(). +// lr: Learning rate. Must be a scalar. +// grad: The gradient. +// indices: A vector of indices into the first dimension of var and accum. +// +// Returns the created operation. +func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceSparseApplyAdagrad", + Input: []tf.Input{ + var_, accum, lr, grad, indices, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Converts the quantized `input` tensor into a lower-precision `output`. +// +// Converts the quantized `input` tensor into a lower-precision `output`, using the +// output range specified with `requested_output_min` and `requested_output_max`. 
+// +// `[input_min, input_max]` are scalar floats that specify the range for the float +// interpretation of the `input` data. For example, if `input_min` is -1.0f and +// `input_max` is 1.0f, and we are dealing with `quint16` quantized data, then a 0 +// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f. +// +// Arguments: +// +// input_min: The float value that the minimum quantized input value represents. +// input_max: The float value that the maximum quantized input value represents. +// requested_output_min: The float value that the minimum quantized output value represents. +// requested_output_max: The float value that the maximum quantized output value represents. +// out_type: The type of the output. Should be a lower bit depth than Tinput. +// +// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output. +func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"out_type": out_type} + opspec := tf.OpSpec{ + Type: "Requantize", + Input: []tf.Input{ + input, input_min, input_max, requested_output_min, requested_output_max, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// ExperimentalStatsAggregatorHandleAttr is an optional argument to ExperimentalStatsAggregatorHandle. +type ExperimentalStatsAggregatorHandleAttr func(optionalAttr) + +// ExperimentalStatsAggregatorHandleContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func ExperimentalStatsAggregatorHandleContainer(value string) ExperimentalStatsAggregatorHandleAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// ExperimentalStatsAggregatorHandleSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func ExperimentalStatsAggregatorHandleSharedName(value string) ExperimentalStatsAggregatorHandleAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Creates a statistics manager resource. +func ExperimentalStatsAggregatorHandle(scope *Scope, optional ...ExperimentalStatsAggregatorHandleAttr) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ExperimentalStatsAggregatorHandle", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingAdadeltaParametersGradAccumDebug. +type LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr func(optionalAttr) + +// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName sets the optional table_name attribute to value. 
+// If not specified, defaults to "" +func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Load Adadelta parameters with debug support. +// +// An op that loads optimization parameters into HBM for embedding. Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. +// +// Arguments: +// parameters: Value of parameters used in the Adadelta optimization algorithm. +// accumulators: Value of accumulators used in the Adadelta optimization algorithm. +// updates: Value of updates used in the Adadelta optimization algorithm. +// gradient_accumulators: Value of gradient_accumulators used in the Adadelta optimization algorithm. +// +// +// +// Returns the created operation. +func LoadTPUEmbeddingAdadeltaParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug", + Input: []tf.Input{ + parameters, accumulators, updates, gradient_accumulators, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Creates a dataset that zips together `input_datasets`. +func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "ZipDataset", + Input: []tf.Input{ + tf.OutputList(input_datasets), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes natural logarithm of x element-wise. +// +// I.e., \\(y = \log_e x\\). +func Log(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Log", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// StringLengthAttr is an optional argument to StringLength. +type StringLengthAttr func(optionalAttr) + +// StringLengthUnit sets the optional unit attribute to value. +// +// value: The unit that is counted to compute string length. One of: `"BYTE"` (for +// the number of bytes in each string) or `"UTF8_CHAR"` (for the number of UTF-8 +// encoded Unicode code points in each string). Results are undefined +// if `unit=UTF8_CHAR` and the `input` strings do not contain structurally +// valid UTF-8. +// If not specified, defaults to "BYTE" +func StringLengthUnit(value string) StringLengthAttr { + return func(m optionalAttr) { + m["unit"] = value + } +} + +// String lengths of `input`. +// +// Computes the length of each string given in the input tensor. +// +// Arguments: +// input: The string for which to compute the length. +// +// Returns Integer tensor that has the same shape as `input`. The output contains the +// element-wise string lengths of `input`. 
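+//
+// Illustrative usage (a hedged sketch, not part of the generated op docs; it
+// assumes a Scope from op.NewScope() and the Const wrapper from this package):
+//
+// ```go
+// s := op.NewScope()
+// strs := op.Const(s, []string{"hello", "résumé"})
+// // Byte lengths are [5, 8]; with UTF8_CHAR the second entry counts 6 code points.
+// nBytes := op.StringLength(s, strs)
+// nChars := op.StringLength(s, strs, op.StringLengthUnit("UTF8_CHAR"))
+// _, _ = nBytes, nChars
+// ```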
+func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StringLength", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor. +type ResizeNearestNeighborAttr func(optionalAttr) + +// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value. +// +// value: If true, the centers of the 4 corner pixels of the input and output tensors are +// aligned, preserving the values at the corner pixels. Defaults to false. +// If not specified, defaults to false +func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr { + return func(m optionalAttr) { + m["align_corners"] = value + } +} + +// ResizeNearestNeighborHalfPixelCenters sets the optional half_pixel_centers attribute to value. +// If not specified, defaults to false +func ResizeNearestNeighborHalfPixelCenters(value bool) ResizeNearestNeighborAttr { + return func(m optionalAttr) { + m["half_pixel_centers"] = value + } +} + +// Resize `images` to `size` using nearest neighbor interpolation. +// +// Arguments: +// images: 4-D with shape `[batch, height, width, channels]`. +// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The +// new size for the images. +// +// Returns 4-D with shape +// `[batch, new_height, new_width, channels]`. +func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResizeNearestNeighbor", + Input: []tf.Input{ + images, size, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RandomCropAttr is an optional argument to RandomCrop. +type RandomCropAttr func(optionalAttr) + +// RandomCropSeed sets the optional seed attribute to value. +// +// value: If either seed or seed2 are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func RandomCropSeed(value int64) RandomCropAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// RandomCropSeed2 sets the optional seed2 attribute to value. +// +// value: An second seed to avoid seed collision. +// If not specified, defaults to 0 +func RandomCropSeed2(value int64) RandomCropAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Randomly crop `image`. +// +// DEPRECATED at GraphDef version 8: Random crop is now pure Python +// +// `size` is a 1-D int64 tensor with 2 elements representing the crop height and +// width. The values must be non negative. +// +// This Op picks a random location in `image` and crops a `height` by `width` +// rectangle from that location. The random location is picked so the cropped +// area will fit inside the original image. +// +// Arguments: +// image: 3-D of shape `[height, width, channels]`. +// size: 1-D of length 2 containing: `crop_height`, `crop_width`.. 
+// +// Returns 3-D of shape `[crop_height, crop_width, channels].` +func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RandomCrop", + Input: []tf.Input{ + image, size, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a dataset that emits the outputs of `input_dataset` `count` times. +// +// Arguments: +// +// count: A scalar representing the number of times that `input_dataset` should +// be repeated. A value of `-1` indicates that it should be repeated infinitely. +// +// +func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "RepeatDataset", + Input: []tf.Input{ + input_dataset, count, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// QuantizedConv2DPerChannelAttr is an optional argument to QuantizedConv2DPerChannel. +type QuantizedConv2DPerChannelAttr func(optionalAttr) + +// QuantizedConv2DPerChannelOutType sets the optional out_type attribute to value. +// +// value: The quantized type of output tensor that needs to be converted. +// If not specified, defaults to DT_QINT32 +func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChannelAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. +// +// value: list of dilation values. +// If not specified, defaults to <i:1 i:1 i:1 i:1 > +func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes QuantizedConv2D per channel. +// +// Arguments: +// input: The original input tensor. +// filter: The original filter tensor. +// min_input: The minimum value of the input tensor +// max_input: The maximum value of the input tensor. +// min_filter: The minimum value of the filter tensor. +// max_filter: The maximum value of the filter tensor. +// strides: list of stride values. +// +// +// Returns The output tensor.The minimum value of the final output tensor.The maximum value of the final output tensor. +func QuantizedConv2DPerChannel(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DPerChannelAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QuantizedConv2DPerChannel", + Input: []tf.Input{ + input, filter, min_input, max_input, min_filter, max_filter, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// QuantizedDepthwiseConv2DWithBiasAttr is an optional argument to QuantizedDepthwiseConv2DWithBias. 
+type QuantizedDepthwiseConv2DWithBiasAttr func(optionalAttr) + +// QuantizedDepthwiseConv2DWithBiasOutType sets the optional out_type attribute to value. +// +// value: The type of the output. +// If not specified, defaults to DT_QINT32 +func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwiseConv2DWithBiasAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. +// +// value: List of dilation values. +// If not specified, defaults to <i:1 i:1 i:1 i:1 > +func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes quantized depthwise Conv2D with Bias. +// +// Arguments: +// input: The original input tensor. +// filter: The original filter tensor. +// bias: The original bias tensor. +// min_input: The float value that the minimum quantized input value represents. +// max_input: The float value that the maximum quantized input value represents. +// min_filter: The float value that the minimum quantized filter value represents. +// max_filter: The float value that the maximum quantized filter value represents. +// strides: List of stride values. +// +// +// Returns The output tensor.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents. +func QuantizedDepthwiseConv2DWithBias(scope *Scope, input tf.Output, filter tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DWithBiasAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QuantizedDepthwiseConv2DWithBias", + Input: []tf.Input{ + input, filter, bias, min_input, max_input, min_filter, max_filter, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix. +type LoadAndRemapMatrixAttr func(optionalAttr) + +// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value. +// +// value: The maximum number of rows to load from the checkpoint at +// once. If less than or equal to 0, the entire matrix will be loaded into +// memory. Setting this arg trades increased disk reads for lower memory usage. +// If not specified, defaults to -1 +func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr { + return func(m optionalAttr) { + m["max_rows_in_memory"] = value + } +} + +// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint +// +// at `ckpt_path` and potentially reorders its rows and columns using the +// specified remappings. +// +// Most users should use one of the wrapper initializers (such as +// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this +// function directly. +// +// The remappings are 1-D tensors with the following properties: +// +// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output +// matrix will be initialized from the row corresponding to index +// `row_remapping[i]` in the old `Tensor` from the checkpoint. 
+// * `col_remapping` must have either 0 entries (indicating that no column +// reordering is needed) or `num_cols` entries. If specified, column `j` of the +// output matrix will be initialized from the column corresponding to index +// `col_remapping[j]` in the old `Tensor` from the checkpoint. +// * A value of -1 in either of the remappings signifies a "missing" entry. In that +// case, values from the `initializing_values` tensor will be used to fill that +// missing row or column. If `row_remapping` has `r` missing entries and +// `col_remapping` has `c` missing entries, then the following condition must be +// true: +// +// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)` +// +// The remapping tensors can be generated using the GenerateVocabRemapping op. +// +// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1], +// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing +// the value from row i, column j of the old tensor in the checkpoint, the output +// matrix will look like the following: +// +// [[w(1, 0), w(1, 2), 0.5], +// [w(0, 0), w(0, 2), -0.5], +// [0.25, -0.25, 42]] +// +// Arguments: +// ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from +// which the old matrix `Tensor` will be loaded. +// old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. +// row_remapping: An int `Tensor` of row remappings (generally created by +// `generate_vocab_remapping`). Even if no row remapping is needed, this must +// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted +// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`). +// col_remapping: An int `Tensor` of column remappings (generally created by +// `generate_vocab_remapping`). May be a size-0 `Tensor` if only row remapping +// is to be done (e.g. column ordering is the same). +// initializing_values: A float `Tensor` containing values to fill in for cells +// in the output matrix that are not loaded from the checkpoint. Length must be +// exactly the same as the number of missing / new cells. +// num_rows: Number of rows (length of the 1st dimension) in the output matrix. +// num_cols: Number of columns (length of the 2nd dimension) in the output matrix. +// +// Returns Output matrix containing existing values loaded from the +// checkpoint, and with any missing values filled in from initializing_values. +func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LoadAndRemapMatrix", + Input: []tf.Input{ + ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// QuantizedDepthwiseConv2DWithBiasAndReluAttr is an optional argument to QuantizedDepthwiseConv2DWithBiasAndRelu. +type QuantizedDepthwiseConv2DWithBiasAndReluAttr func(optionalAttr) + +// QuantizedDepthwiseConv2DWithBiasAndReluOutType sets the optional out_type attribute to value. +// +// value: The type of the output. 
+// If not specified, defaults to DT_QINT32 +func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) QuantizedDepthwiseConv2DWithBiasAndReluAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. +// +// value: List of dilation values. +// If not specified, defaults to <i:1 i:1 i:1 i:1 > +func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes quantized depthwise Conv2D with Bias and Relu. +// +// Arguments: +// input: The original input tensor. +// filter: The original filter tensor. +// bias: The original bias tensor. +// min_input: The float value that the minimum quantized input value represents. +// max_input: The float value that the maximum quantized input value represents. +// min_filter: The float value that the minimum quantized filter value represents. +// max_filter: The float value that the maximum quantized filter value represents. +// strides: List of stride values. +// +// +// Returns The output tensor.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents. +func QuantizedDepthwiseConv2DWithBiasAndRelu(scope *Scope, input tf.Output, filter tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DWithBiasAndReluAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QuantizedDepthwiseConv2DWithBiasAndRelu", + Input: []tf.Input{ + input, filter, bias, min_input, max_input, min_filter, max_filter, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr is an optional argument to QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize. +type QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr func(optionalAttr) + +// QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType sets the optional out_type attribute to value. +// +// value: The type of the output. +// If not specified, defaults to DT_QUINT8 +func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataType) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. +// +// value: List of dilation values. +// If not specified, defaults to <i:1 i:1 i:1 i:1 > +func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes quantized depthwise Conv2D with Bias, Relu and Requantize. +// +// Arguments: +// input: The original input tensor. +// filter: The original filter tensor. +// bias: The original bias tensor. +// min_input: The float value that the minimum quantized input value represents. +// max_input: The float value that the maximum quantized input value represents. 
+// min_filter: The float value that the minimum quantized filter value represents. +// max_filter: The float value that the maximum quantized filter value represents. +// min_freezed_output: The minimum float value of the output tensor. +// max_freezed_output: The maximum float value of the output tensor. +// strides: List of stride values. +// +// +// Returns The output tensor.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents. +func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize(scope *Scope, input tf.Output, filter tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, min_freezed_output tf.Output, max_freezed_output tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize", + Input: []tf.Input{ + input, filter, bias, min_input, max_input, min_filter, max_filter, min_freezed_output, max_freezed_output, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Copy a tensor setting everything outside a central band in each innermost matrix +// +// to zero. +// +// The `band` part is computed as follows: +// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a +// tensor with the same shape where +// +// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`. +// +// The indicator function +// +// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) && +// (num_upper < 0 || (n-m) <= num_upper)`. +// +// For example: +// +// ``` +// # if 'input' is [[ 0, 1, 2, 3] +// [-1, 0, 1, 2] +// [-2, -1, 0, 1] +// [-3, -2, -1, 0]], +// +// tf.matrix_band_part(input, 1, -1) ==> [[ 0, 1, 2, 3] +// [-1, 0, 1, 2] +// [ 0, -1, 0, 1] +// [ 0, 0, -1, 0]], +// +// tf.matrix_band_part(input, 2, 1) ==> [[ 0, 1, 0, 0] +// [-1, 0, 1, 0] +// [-2, -1, 0, 1] +// [ 0, -2, -1, 0]] +// ``` +// +// Useful special cases: +// +// ``` +// tf.matrix_band_part(input, 0, -1) ==> Upper triangular part. +// tf.matrix_band_part(input, -1, 0) ==> Lower triangular part. +// tf.matrix_band_part(input, 0, 0) ==> Diagonal. +// ``` +// +// Arguments: +// input: Rank `k` tensor. +// num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire +// lower triangle. +// num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep +// entire upper triangle. +// +// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor. +func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "MatrixBandPart", + Input: []tf.Input{ + input, num_lower, num_upper, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Outputs a tensor containing the reduction across all input tensors. +// +// Outputs a tensor containing the reduction across all input tensors passed to ops +// within the same `shared_name. 
+// +// The graph should be constructed so if one op runs with shared_name value `c`, +// then `num_devices` ops will run with shared_name value `c`. Failure to do so +// will cause the graph execution to fail to complete. +// +// input: the input to the reduction +// data: the value of the reduction across all `num_devices` devices. +// reduction: the reduction operation to perform. +// num_devices: The number of devices participating in this reduction. +// shared_name: Identifier that shared between ops of the same reduction. +func NcclAllReduce(scope *Scope, input tf.Output, reduction string, num_devices int64, shared_name string) (data tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"reduction": reduction, "num_devices": num_devices, "shared_name": shared_name} + opspec := tf.OpSpec{ + Type: "NcclAllReduce", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Reduces `input` from `num_devices` using `reduction` to a single device. +// +// Reduces `input` from `num_devices` using `reduction` to a single device. +// +// The graph should be constructed so that all inputs have a valid device +// assignment, and the op itself is assigned one of these devices. +// +// input: The input to the reduction. +// data: the value of the reduction across all `num_devices` devices. +// reduction: the reduction operation to perform. +func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"reduction": reduction} + opspec := tf.OpSpec{ + Type: "NcclReduce", + Input: []tf.Input{ + tf.OutputList(input), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA. +type ResourceApplyAdagradDAAttr func(optionalAttr) + +// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value. +// +// value: If True, updating of the var and accum tensors will be protected by +// a lock; otherwise the behavior is undefined, but may exhibit less contention. +// If not specified, defaults to false +func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' according to the proximal adagrad scheme. +// +// Arguments: +// var_: Should be from a Variable(). +// gradient_accumulator: Should be from a Variable(). +// gradient_squared_accumulator: Should be from a Variable(). +// grad: The gradient. +// lr: Scaling factor. Must be a scalar. +// l1: L1 regularization. Must be a scalar. +// l2: L2 regularization. Must be a scalar. +// global_step: Training step number. Must be a scalar. +// +// Returns the created operation. +func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceApplyAdagradDA", + Input: []tf.Input{ + var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Saves tensors in V2 checkpoint format. 
+// +// By default, saves the named tensors in full. If the caller wishes to save +// specific slices of full tensors, "shape_and_slices" should be non-empty strings +// and correspondingly well-formed. +// +// Arguments: +// prefix: Must have a single element. The prefix of the V2 checkpoint to which we +// write the tensors. +// tensor_names: shape {N}. The names of the tensors to be saved. +// shape_and_slices: shape {N}. The slice specs of the tensors to be saved. +// Empty strings indicate that they are non-partitioned tensors. +// tensors: `N` tensors to save. +// +// Returns the created operation. +func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SaveV2", + Input: []tf.Input{ + prefix, tensor_names, shape_and_slices, tf.OutputList(tensors), + }, + } + return scope.AddOperation(opspec) +} + +// Deserialize and concatenate `SparseTensors` from a serialized minibatch. +// +// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where +// `N` is the minibatch size and the rows correspond to packed outputs of +// `SerializeSparse`. The ranks of the original `SparseTensor` objects +// must all match. When the final `SparseTensor` is created, it has rank one +// higher than the ranks of the incoming `SparseTensor` objects +// (they have been concatenated along a new row dimension). +// +// The output `SparseTensor` object's shape values for all dimensions but the +// first are the max across the input `SparseTensor` objects' shape values +// for the corresponding dimensions. Its first shape value is `N`, the minibatch +// size. +// +// The input `SparseTensor` objects' indices are assumed ordered in +// standard lexicographic order. If this is not the case, after this +// step run `SparseReorder` to restore index ordering. +// +// For example, if the serialized input is a `[2 x 3]` matrix representing two +// original `SparseTensor` objects: +// +// index = [ 0] +// [10] +// [20] +// values = [1, 2, 3] +// shape = [50] +// +// and +// +// index = [ 2] +// [10] +// values = [4, 5] +// shape = [30] +// +// then the final deserialized `SparseTensor` will be: +// +// index = [0 0] +// [0 10] +// [0 20] +// [1 2] +// [1 10] +// values = [1, 2, 3, 4, 5] +// shape = [2 50] +// +// Arguments: +// serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects. +// Must have 3 columns. +// dtype: The `dtype` of the serialized `SparseTensor` objects. +func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype} + opspec := tf.OpSpec{ + Type: "DeserializeManySparse", + Input: []tf.Input{ + serialized_sparse, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Sends `input` to all devices that are connected to the output. +// +// Sends `input` to all devices that are connected to the output. +// +// The graph should be constructed so that all ops connected to the output have a +// valid device assignment, and the op itself is assigned one of these devices. +// +// input: The input to the broadcast. +// output: The same as input. +// shape: The shape of the input tensor. 
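+//
+// A minimal usage sketch (editorial illustration, not generated documentation;
+// `s` is assumed to be an op.Scope and `gpuInput` a hypothetical tensor already
+// placed on one of the participating GPU devices):
+//
+//     bcast := op.NcclBroadcast(s, gpuInput, tf.MakeShape(2, 3))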
+// +func NcclBroadcast(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"shape": shape} + opspec := tf.OpSpec{ + Type: "NcclBroadcast", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// EnqueueTPUEmbeddingSparseTensorBatchAttr is an optional argument to EnqueueTPUEmbeddingSparseTensorBatch. +type EnqueueTPUEmbeddingSparseTensorBatchAttr func(optionalAttr) + +// EnqueueTPUEmbeddingSparseTensorBatchDeviceOrdinal sets the optional device_ordinal attribute to value. +// +// value: The TPU device to use. Should be >= 0 and less than the number +// of TPU cores in the task on which the node is placed. +// If not specified, defaults to -1 +func EnqueueTPUEmbeddingSparseTensorBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingSparseTensorBatchAttr { + return func(m optionalAttr) { + m["device_ordinal"] = value + } +} + +// EnqueueTPUEmbeddingSparseTensorBatchCombiners sets the optional combiners attribute to value. +// +// value: A list of string scalars, one for each embedding table that specify +// how to normalize the embedding activations after weighted summation. +// Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have +// the sum of the weights be 0 for 'mean' or the sum of the squared weights be +// 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for +// all tables. +// If not specified, defaults to <> +func EnqueueTPUEmbeddingSparseTensorBatchCombiners(value []string) EnqueueTPUEmbeddingSparseTensorBatchAttr { + return func(m optionalAttr) { + m["combiners"] = value + } +} + +// EnqueueTPUEmbeddingSparseTensorBatchMaxSequenceLengths sets the optional max_sequence_lengths attribute to value. +// If not specified, defaults to <> +func EnqueueTPUEmbeddingSparseTensorBatchMaxSequenceLengths(value []int64) EnqueueTPUEmbeddingSparseTensorBatchAttr { + return func(m optionalAttr) { + m["max_sequence_lengths"] = value + } +} + +// Eases the porting of code that uses tf.nn.embedding_lookup_sparse(). +// +// sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond +// to the ith feature. table_ids[i] indicates which embedding table to look up ith +// feature. +// +// The tensors at corresponding positions in the three input lists (sample_indices, +// embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1 +// with dim_size() equal to the total number of lookups into the table described by +// the corresponding feature. +// +// Arguments: +// sample_indices: A list of rank 1 Tensors specifying the training example to +// which the corresponding embedding_indices and aggregation_weights values +// belong. It corresponds to sp_ids.indices[:,0] in embedding_lookup_sparse(). +// embedding_indices: A list of rank 1 Tensors, indices into the embedding tables. +// It corresponds to sp_ids.values in embedding_lookup_sparse(). +// aggregation_weights: A list of rank 1 Tensors containing per training example +// aggregation weights. It corresponds to sp_weights.values in +// embedding_lookup_sparse(). +// mode_override: A string input that overrides the mode specified in the +// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference', +// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set +// in TPUEmbeddingConfiguration is used, otherwise mode_override is used. 
+// table_ids: A list of integers specifying the identifier of the embedding table +// (offset of TableDescriptor in the TPUEmbeddingConfiguration) to lookup the +// corresponding input. The ith input is looked up using table_ids[i]. The size +// of the table_ids list must be equal to that of sample_indices, +// embedding_indices and aggregation_weights. +// +// Returns the created operation. +func EnqueueTPUEmbeddingSparseTensorBatch(scope *Scope, sample_indices []tf.Output, embedding_indices []tf.Output, aggregation_weights []tf.Output, mode_override tf.Output, table_ids []int64, optional ...EnqueueTPUEmbeddingSparseTensorBatchAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"table_ids": table_ids} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "EnqueueTPUEmbeddingSparseTensorBatch", + Input: []tf.Input{ + tf.OutputList(sample_indices), tf.OutputList(embedding_indices), tf.OutputList(aggregation_weights), mode_override, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Add all input tensors element wise. +// +// Arguments: +// inputs: Must all be the same size and shape. +func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "AddN", + Input: []tf.Input{ + tf.OutputList(inputs), + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AbortAttr is an optional argument to Abort. +type AbortAttr func(optionalAttr) + +// AbortErrorMsg sets the optional error_msg attribute to value. +// +// value: A string which is the message associated with the exception. +// If not specified, defaults to "" +func AbortErrorMsg(value string) AbortAttr { + return func(m optionalAttr) { + m["error_msg"] = value + } +} + +// AbortExitWithoutError sets the optional exit_without_error attribute to value. +// If not specified, defaults to false +func AbortExitWithoutError(value bool) AbortAttr { + return func(m optionalAttr) { + m["exit_without_error"] = value + } +} + +// Raise a exception to abort the process when called. +// +// If exit_without_error is true, the process will exit normally, +// otherwise it will exit with a SIGABORT signal. +// +// Returns nothing but an exception. +// +// Returns the created operation. +func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Abort", + + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Returns the element-wise sum of a list of tensors. +// +// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not +// wait for all of its inputs to be ready before beginning to sum. This can +// save memory if inputs are ready at different times, since minimum temporary +// storage is proportional to the output size rather than the inputs size. +// +// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable. +// +// Returns a `Tensor` of same shape and type as the elements of `inputs`. +// +// Arguments: +// inputs: A list of `Tensor` objects, each with same shape and type. +// shape: Shape of elements of `inputs`. 
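+//
+// A minimal usage sketch (editorial illustration, not generated documentation;
+// it assumes a scope created with op.NewScope and uses op.Const from this package):
+//
+//     s := op.NewScope()
+//     a := op.Const(s, [][]float32{{1, 2}, {3, 4}})
+//     b := op.Const(s, [][]float32{{10, 20}, {30, 40}})
+//     sum := op.AccumulateNV2(s, []tf.Output{a, b}, tf.MakeShape(2, 2))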
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"shape": shape} + opspec := tf.OpSpec{ + Type: "AccumulateNV2", + Input: []tf.Input{ + tf.OutputList(inputs), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// A container for an iterator resource. +// +// Returns A handle to the iterator that can be passed to a "MakeIterator" or +// "IteratorGetNext" op. In contrast to Iterator, AnonymousIterator prevents +// resource sharing by name, and does not keep a reference to the resource +// container.A variant deleter that should be passed into the op that deletes the iterator. +func AnonymousIteratorV2(scope *Scope, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output, deleter tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "AnonymousIteratorV2", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// TridiagonalSolveAttr is an optional argument to TridiagonalSolve. +type TridiagonalSolveAttr func(optionalAttr) + +// TridiagonalSolvePartialPivoting sets the optional partial_pivoting attribute to value. +// +// value: Whether to apply partial pivoting. Partial pivoting makes the procedure more +// stable, but slower. +// If not specified, defaults to true +func TridiagonalSolvePartialPivoting(value bool) TridiagonalSolveAttr { + return func(m optionalAttr) { + m["partial_pivoting"] = value + } +} + +// Solves tridiagonal systems of equations. +// +// Solves tridiagonal systems of equations. +// Supports batch dimensions and multiple right-hand sides per each left-hand +// side. +// On CPU, solution is computed via Gaussian elimination with or without partial +// pivoting, depending on `partial_pivoting` attribute. On GPU, Nvidia's cuSPARSE +// library is used: https://docs.nvidia.com/cuda/cusparse/index.html#gtsv +// +// Arguments: +// diagonals: Tensor of shape `[..., 3, M]` whose innermost 2 dimensions represent the +// tridiagonal matrices with three rows being the superdiagonal, diagonals, and +// subdiagonals, in order. The last element of the superdiagonal and the first +// element of the subdiagonal is ignored. +// rhs: Tensor of shape `[..., M, K]`, representing K right-hand sides per each +// left-hand side. +// +// Returns Tensor of shape `[..., M, K]` containing the solutions +func TridiagonalSolve(scope *Scope, diagonals tf.Output, rhs tf.Output, optional ...TridiagonalSolveAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TridiagonalSolve", + Input: []tf.Input{ + diagonals, rhs, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Assigns a new value to a variable. +// +// Any ReadVariableOp with a control dependency on this op is guaranteed to return +// this value or a subsequent newer value of the variable. +// +// Arguments: +// resource: handle to the resource in which to store the variable. +// value: the value to set the new tensor to use. +// +// Returns the created operation. 
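+//
+// A hedged usage sketch (editorial illustration, not generated documentation; it
+// assumes the VarHandleOp and Const wrappers defined elsewhere in this package):
+//
+//     s := op.NewScope()
+//     v := op.VarHandleOp(s, tf.Float, tf.ScalarShape())
+//     assign := op.AssignVariableOp(s, v, op.Const(s, float32(3)))
+//     // `assign` must be run (e.g. as a session target) before the variable is read.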
+func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "AssignVariableOp", + Input: []tf.Input{ + resource, value, + }, + } + return scope.AddOperation(opspec) +} + +// TruncatedNormalAttr is an optional argument to TruncatedNormal. +type TruncatedNormalAttr func(optionalAttr) + +// TruncatedNormalSeed sets the optional seed attribute to value. +// +// value: If either `seed` or `seed2` are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func TruncatedNormalSeed(value int64) TruncatedNormalAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// TruncatedNormalSeed2 sets the optional seed2 attribute to value. +// +// value: A second seed to avoid seed collision. +// If not specified, defaults to 0 +func TruncatedNormalSeed2(value int64) TruncatedNormalAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Outputs random values from a truncated normal distribution. +// +// The generated values follow a normal distribution with mean 0 and standard +// deviation 1, except that values whose magnitude is more than 2 standard +// deviations from the mean are dropped and re-picked. +// +// Arguments: +// shape: The shape of the output tensor. +// dtype: The type of the output. +// +// Returns A tensor of the specified shape filled with random truncated normal +// values. +func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TruncatedNormal", + Input: []tf.Input{ + shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg. +type DecodeAndCropJpegAttr func(optionalAttr) + +// DecodeAndCropJpegChannels sets the optional channels attribute to value. +// +// value: Number of color channels for the decoded image. +// If not specified, defaults to 0 +func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr { + return func(m optionalAttr) { + m["channels"] = value + } +} + +// DecodeAndCropJpegRatio sets the optional ratio attribute to value. +// +// value: Downscaling ratio. +// If not specified, defaults to 1 +func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr { + return func(m optionalAttr) { + m["ratio"] = value + } +} + +// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value. +// +// value: If true use a slower but nicer upscaling of the +// chroma planes (yuv420/422 only). +// If not specified, defaults to true +func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr { + return func(m optionalAttr) { + m["fancy_upscaling"] = value + } +} + +// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value. +// +// value: If true try to recover an image from truncated input. +// If not specified, defaults to false +func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr { + return func(m optionalAttr) { + m["try_recover_truncated"] = value + } +} + +// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value. 
+// +// value: The minimum required fraction of lines before a truncated +// input is accepted. +// If not specified, defaults to 1 +func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr { + return func(m optionalAttr) { + m["acceptable_fraction"] = value + } +} + +// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value. +// +// value: string specifying a hint about the algorithm used for +// decompression. Defaults to "" which maps to a system-specific +// default. Currently valid values are ["INTEGER_FAST", +// "INTEGER_ACCURATE"]. The hint may be ignored (e.g., the internal +// jpeg library changes to a version that does not have that specific +// option.) +// If not specified, defaults to "" +func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr { + return func(m optionalAttr) { + m["dct_method"] = value + } +} + +// Decode and Crop a JPEG-encoded image to a uint8 tensor. +// +// The attr `channels` indicates the desired number of color channels for the +// decoded image. +// +// Accepted values are: +// +// * 0: Use the number of channels in the JPEG-encoded image. +// * 1: output a grayscale image. +// * 3: output an RGB image. +// +// If needed, the JPEG-encoded image is transformed to match the requested number +// of color channels. +// +// The attr `ratio` allows downscaling the image by an integer factor during +// decoding. Allowed values are: 1, 2, 4, and 8. This is much faster than +// downscaling the image later. +// +// +// It is equivalent to a combination of decode and crop, but much faster by only +// decoding partial jpeg image. +// +// Arguments: +// contents: 0-D. The JPEG-encoded image. +// crop_window: 1-D. The crop window: [crop_y, crop_x, crop_height, crop_width]. +// +// Returns 3-D with shape `[height, width, channels]`.. +func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DecodeAndCropJpeg", + Input: []tf.Input{ + contents, crop_window, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// BatchMatMulAttr is an optional argument to BatchMatMul. +type BatchMatMulAttr func(optionalAttr) + +// BatchMatMulAdjX sets the optional adj_x attribute to value. +// +// value: If `True`, adjoint the slices of `x`. Defaults to `False`. +// If not specified, defaults to false +func BatchMatMulAdjX(value bool) BatchMatMulAttr { + return func(m optionalAttr) { + m["adj_x"] = value + } +} + +// BatchMatMulAdjY sets the optional adj_y attribute to value. +// +// value: If `True`, adjoint the slices of `y`. Defaults to `False`. +// If not specified, defaults to false +func BatchMatMulAdjY(value bool) BatchMatMulAttr { + return func(m optionalAttr) { + m["adj_y"] = value + } +} + +// Multiplies slices of two tensors in batches. +// +// Multiplies all slices of `Tensor` `x` and `y` (each slice can be +// viewed as an element of a batch), and arranges the individual results +// in a single output tensor of the same batch size. Each of the +// individual slices can optionally be adjointed (to adjoint a matrix +// means to transpose and conjugate it) before multiplication by setting +// the `adj_x` or `adj_y` flag to `True`, which are by default `False`. 
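+//
+// Editorial clarification: for a matrix slice `m`, `adjoint(m)[i, j] = conj(m[j, i])`;
+// for real-valued tensors the conjugation is a no-op, so adjointing simply transposes
+// the two innermost dimensions.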
+// +// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]` +// and `[..., r_y, c_y]`. +// +// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where: +// +// r_o = c_x if adj_x else r_x +// c_o = r_y if adj_y else c_y +// +// It is computed as: +// +// output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) +// +// Arguments: +// x: 2-D or higher with shape `[..., r_x, c_x]`. +// y: 2-D or higher with shape `[..., r_y, c_y]`. +// +// Returns 3-D or higher with shape `[..., r_o, c_o]` +func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "BatchMatMul", + Input: []tf.Input{ + x, y, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// BatchMatMulV2Attr is an optional argument to BatchMatMulV2. +type BatchMatMulV2Attr func(optionalAttr) + +// BatchMatMulV2AdjX sets the optional adj_x attribute to value. +// +// value: If `True`, adjoint the slices of `x`. Defaults to `False`. +// If not specified, defaults to false +func BatchMatMulV2AdjX(value bool) BatchMatMulV2Attr { + return func(m optionalAttr) { + m["adj_x"] = value + } +} + +// BatchMatMulV2AdjY sets the optional adj_y attribute to value. +// +// value: If `True`, adjoint the slices of `y`. Defaults to `False`. +// If not specified, defaults to false +func BatchMatMulV2AdjY(value bool) BatchMatMulV2Attr { + return func(m optionalAttr) { + m["adj_y"] = value + } +} + +// Multiplies slices of two tensors in batches. +// +// Multiplies all slices of `Tensor` `x` and `y` (each slice can be +// viewed as an element of a batch), and arranges the individual results +// in a single output tensor of the same batch size. Each of the +// individual slices can optionally be adjointed (to adjoint a matrix +// means to transpose and conjugate it) before multiplication by setting +// the `adj_x` or `adj_y` flag to `True`, which are by default `False`. +// +// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]` +// and `[..., r_y, c_y]`. +// +// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where: +// +// r_o = c_x if adj_x else r_x +// c_o = r_y if adj_y else c_y +// +// It is computed as: +// +// output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) +// +// *NOTE*: `BatchMatMulV2` supports broadcasting in the batch dimensions. More +// about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html). +// +// +// Arguments: +// x: 2-D or higher with shape `[..., r_x, c_x]`. +// y: 2-D or higher with shape `[..., r_y, c_y]`. +// +// Returns 3-D or higher with shape `[..., r_o, c_o]` +func BatchMatMulV2(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulV2Attr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "BatchMatMulV2", + Input: []tf.Input{ + x, y, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the matrix logarithm of one or more square matrices: +// +// +// \\(log(exp(A)) = A\\) +// +// This op is only defined for complex matrices. If A is positive-definite and +// real, then casting to a complex matrix, taking the logarithm and casting back +// to a real matrix will give the correct result. 
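+//
+// A hedged sketch of that workaround (editorial illustration, not generated
+// documentation; it assumes the Cast and Real wrappers defined elsewhere in this
+// package and a real, positive-definite tensor `a` of shape `[..., M, M]`):
+//
+//     ac := op.Cast(s, a, tf.Complex64)
+//     logA := op.Real(s, op.MatrixLogarithm(s, ac))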
+// +// This function computes the matrix logarithm using the Schur-Parlett algorithm. +// Details of the algorithm can be found in Section 11.6.2 of: +// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008. +// ISBN 978-0-898716-46-7. +// +// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +// form square matrices. The output is a tensor of the same shape as the input +// containing the exponential for all input submatrices `[..., :, :]`. +// +// Arguments: +// input: Shape is `[..., M, M]`. +// +// Returns Shape is `[..., M, M]`. +// +// @compatibility(scipy) +// Equivalent to scipy.linalg.logm +// @end_compatibility +func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "MatrixLogarithm", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// EncodeBase64Attr is an optional argument to EncodeBase64. +type EncodeBase64Attr func(optionalAttr) + +// EncodeBase64Pad sets the optional pad attribute to value. +// +// value: Bool whether padding is applied at the ends. +// If not specified, defaults to false +func EncodeBase64Pad(value bool) EncodeBase64Attr { + return func(m optionalAttr) { + m["pad"] = value + } +} + +// Encode strings into web-safe base64 format. +// +// Refer to the following article for more information on base64 format: +// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the +// end so that the encoded has length multiple of 4. See Padding section of the +// link above. +// +// Web-safe means that the encoder uses - and _ instead of + and /. +// +// Arguments: +// input: Strings to be encoded. +// +// Returns Input strings encoded in base64. +func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "EncodeBase64", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ComplexAbsAttr is an optional argument to ComplexAbs. +type ComplexAbsAttr func(optionalAttr) + +// ComplexAbsTout sets the optional Tout attribute to value. +// If not specified, defaults to DT_FLOAT +func ComplexAbsTout(value tf.DataType) ComplexAbsAttr { + return func(m optionalAttr) { + m["Tout"] = value + } +} + +// Computes the complex absolute value of a tensor. +// +// Given a tensor `x` of complex numbers, this operation returns a tensor of type +// `float` or `double` that is the absolute value of each element in `x`. All +// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute +// value is computed as \\( \sqrt{a^2 + b^2}\\). +func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ComplexAbs", + Input: []tf.Input{ + x, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// LuAttr is an optional argument to Lu. +type LuAttr func(optionalAttr) + +// LuOutputIdxType sets the optional output_idx_type attribute to value. 
+// If not specified, defaults to DT_INT32 +func LuOutputIdxType(value tf.DataType) LuAttr { + return func(m optionalAttr) { + m["output_idx_type"] = value + } +} + +// Computes the LU decomposition of one or more square matrices. +// +// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +// form square matrices. +// +// The input has to be invertible. +// +// The output consists of two tensors LU and P containing the LU decomposition +// of all input submatrices `[..., :, :]`. LU encodes the lower triangular and +// upper triangular factors. +// +// For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of +// shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower +// triangular part of LU. U is a upper triangular matrix of shape `[M, M]` whose +// entries correspond to the upper triangular part, including the diagonal, of LU. +// +// P represents a permutation matrix encoded as a list of indices each between `0` +// and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to +// P, then the L, U and P satisfies P_mat * input = L * U. +// +// Arguments: +// input: A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of +// size `[M, M]`. +// +// Returns A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the +// lower triangular factor `L` with unit diagonal, and whose upper triangular part +// denotes the upper triangular factor `U`.Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is +// `[..., M]`. +// @compatibility(scipy) +// Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are +// packed into a single tensor, the permutation is applied to `input` instead of +// the right hand side and the permutation `P` is returned as a list of indices +// instead of a permutation matrix. +// @end_compatibility +func Lu(scope *Scope, input tf.Output, optional ...LuAttr) (lu tf.Output, p tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Lu", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent. +type ResourceApplyProximalGradientDescentAttr func(optionalAttr) + +// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value. +// +// value: If True, the subtraction will be protected by a lock; +// otherwise the behavior is undefined, but may exhibit less contention. +// If not specified, defaults to false +func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' as FOBOS algorithm with fixed learning rate. +// +// prox_v = var - alpha * delta +// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} +// +// Arguments: +// var_: Should be from a Variable(). +// alpha: Scaling factor. Must be a scalar. +// l1: L1 regularization. Must be a scalar. +// l2: L2 regularization. Must be a scalar. +// delta: The change. +// +// Returns the created operation. 
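+//
+// Worked illustration of the update above (editorial addition): with var = 1.0,
+// alpha = 0.1, delta = 2.0, l1 = 0.5 and l2 = 0.0, prox_v = 1.0 - 0.1*2.0 = 0.8,
+// and the new value of var is sign(0.8)/(1 + 0.1*0.0) * max{|0.8| - 0.1*0.5, 0} = 0.75.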
+func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceApplyProximalGradientDescent", + Input: []tf.Input{ + var_, alpha, l1, l2, delta, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Computes numerical negative value element-wise. +// +// I.e., \\(y = -x\\). +func Neg(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Neg", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) return op.Output(0) } @@ -17771,12 +30921,12 @@ func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...Va // // Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy` // is the corresponding input gradient. -func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { +func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "InvGrad", + Type: "ReciprocalGrad", Input: []tf.Input{ y, dy, }, @@ -17785,95 +30935,44 @@ func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { return op.Output(0) } -// ReverseSequenceAttr is an optional argument to ReverseSequence. -type ReverseSequenceAttr func(optionalAttr) +// MaxAttr is an optional argument to Max. +type MaxAttr func(optionalAttr) -// ReverseSequenceBatchDim sets the optional batch_dim attribute to value. +// MaxKeepDims sets the optional keep_dims attribute to value. // -// value: The dimension along which reversal is performed. -// If not specified, defaults to 0 -func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr { +// value: If true, retain reduced dimensions with length 1. +// If not specified, defaults to false +func MaxKeepDims(value bool) MaxAttr { return func(m optionalAttr) { - m["batch_dim"] = value + m["keep_dims"] = value } } -// Reverses variable length slices. +// Computes the maximum of elements across dimensions of a tensor. // -// This op first slices `input` along the dimension `batch_dim`, and for each -// slice `i`, reverses the first `seq_lengths[i]` elements along -// the dimension `seq_dim`. -// -// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`, -// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`. -// -// The output slice `i` along dimension `batch_dim` is then given by input -// slice `i`, with the first `seq_lengths[i]` slices along dimension -// `seq_dim` reversed. -// -// For example: -// -// ``` -// # Given this: -// batch_dim = 0 -// seq_dim = 1 -// input.dims = (4, 8, ...) -// seq_lengths = [7, 2, 3, 5] -// -// # then slices of input are reversed on seq_dim, but only up to seq_lengths: -// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...] -// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...] -// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...] -// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...] -// -// # while entries past seq_lens are copied through: -// output[0, 7:, :, ...] = input[0, 7:, :, ...] -// output[1, 2:, :, ...] = input[1, 2:, :, ...] -// output[2, 3:, :, ...] = input[2, 3:, :, ...] -// output[3, 2:, :, ...] = input[3, 2:, :, ...] 
-// ``` -// -// In contrast, if: -// -// ``` -// # Given this: -// batch_dim = 2 -// seq_dim = 0 -// input.dims = (8, ?, 4, ...) -// seq_lengths = [7, 2, 3, 5] -// -// # then slices of input are reversed on seq_dim, but only up to seq_lengths: -// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...] -// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...] -// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...] -// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...] -// -// # while entries past seq_lens are copied through: -// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...] -// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...] -// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...] -// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...] -// ``` +// Reduces `input` along the dimensions given in `axis`. Unless +// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +// `axis`. If `keep_dims` is true, the reduced dimensions are +// retained with length 1. // // Arguments: -// input: The input to reverse. -// seq_lengths: 1-D with length `input.dims(batch_dim)` and -// `max(seq_lengths) <= input.dims(seq_dim)` -// seq_dim: The dimension which is partially reversed. +// input: The tensor to reduce. +// axis: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // -// Returns The partially reversed input. It has the same shape as `input`. -func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) { +// Returns The reduced tensor. +func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"seq_dim": seq_dim} + attrs := map[string]interface{}{} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "ReverseSequence", + Type: "Max", Input: []tf.Input{ - input, seq_lengths, + input, axis, }, Attrs: attrs, } @@ -17881,23 +30980,89 @@ func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_d return op.Output(0) } -// Transforms a serialized tensorflow.TensorProto proto into a Tensor. +// Rounds the values of a tensor to the nearest integer, element-wise. // -// Arguments: -// serialized: A scalar string containing a serialized TensorProto proto. -// out_type: The type of the serialized tensor. The provided type must match the -// type of the serialized tensor and no implicit conversion will take place. -// -// Returns A Tensor of type `out_type`. -func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) { +// Rounds half to even. Also known as bankers rounding. If you want to round +// according to the current system rounding mode use std::cint. +func Round(scope *Scope, x tf.Output) (y tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"out_type": out_type} opspec := tf.OpSpec{ - Type: "ParseTensor", + Type: "Round", Input: []tf.Input{ - serialized, + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the gradient for the rsqrt of `x` wrt its input. +// +// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy` +// is the corresponding input gradient. 
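+//
+// Derivation (editorial note): y = x^(-1/2), so dy/dx = -0.5 * x^(-3/2) = -0.5 * y^3,
+// and the chain rule gives grad = dy * -0.5 * y^3.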
+func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "RsqrtGrad", + Input: []tf.Input{ + y, dy, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResizeAreaAttr is an optional argument to ResizeArea. +type ResizeAreaAttr func(optionalAttr) + +// ResizeAreaAlignCorners sets the optional align_corners attribute to value. +// +// value: If true, the centers of the 4 corner pixels of the input and output tensors are +// aligned, preserving the values at the corner pixels. Defaults to false. +// If not specified, defaults to false +func ResizeAreaAlignCorners(value bool) ResizeAreaAttr { + return func(m optionalAttr) { + m["align_corners"] = value + } +} + +// Resize `images` to `size` using area interpolation. +// +// Input images can be of different types but output images are always float. +// +// The range of pixel values for the output image might be slightly different +// from the range for the input image because of limited numerical precision. +// To guarantee an output range, for example `[0.0, 1.0]`, apply +// `tf.clip_by_value` to the output. +// +// Each output pixel is computed by first transforming the pixel's footprint into +// the input tensor and then averaging the pixels that intersect the footprint. An +// input pixel's contribution to the average is weighted by the fraction of its +// area that intersects the footprint. This is the same as OpenCV's INTER_AREA. +// +// Arguments: +// images: 4-D with shape `[batch, height, width, channels]`. +// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The +// new size for the images. +// +// Returns 4-D with shape +// `[batch, new_height, new_width, channels]`. +func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResizeArea", + Input: []tf.Input{ + images, size, }, Attrs: attrs, } @@ -17905,6 +31070,1549 @@ func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (outp return op.Output(0) } +// LowerBoundAttr is an optional argument to LowerBound. +type LowerBoundAttr func(optionalAttr) + +// LowerBoundOutType sets the optional out_type attribute to value. +// If not specified, defaults to DT_INT32 +func LowerBoundOutType(value tf.DataType) LowerBoundAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// Applies lower_bound(sorted_search_values, values) along each row. +// +// Each set of rows with the same index in (sorted_inputs, values) is treated +// independently. The resulting row is the equivalent of calling +// `np.searchsorted(sorted_inputs, values, side='left')`. +// +// The result is not a global index to the entire +// `Tensor`, but rather just the index in the last dimension. +// +// A 2-D example: +// sorted_sequence = [[0, 3, 9, 9, 10], +// [1, 2, 3, 4, 5]] +// values = [[2, 4, 9], +// [0, 2, 6]] +// +// result = LowerBound(sorted_sequence, values) +// +// result == [[1, 2, 2], +// [0, 1, 5]] +// +// Arguments: +// sorted_inputs: 2-D Tensor where each row is ordered. +// values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains +// the values that will be searched for in `sorted_search_values`. +// +// Returns A `Tensor` with the same shape as `values`. 
It contains the first scalar index +// into the last dimension where values can be inserted without changing the +// ordered property. +func LowerBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...LowerBoundAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LowerBound", + Input: []tf.Input{ + sorted_inputs, values, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingFTRLParametersGradAccumDebug. +type RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr) + +// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Retrieve FTRL embedding parameters with debug support. +// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the FTRL optimization algorithm.Parameter accumulators updated by the FTRL optimization algorithm.Parameter linears updated by the FTRL optimization algorithm.Parameter gradient_accumulators updated by the FTRL optimization algorithm. +func RetrieveTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3) +} + +// Convert one or more images from HSV to RGB. +// +// Outputs a tensor of the same shape as the `images` tensor, containing the RGB +// value of the pixels. The output is only well defined if the value in `images` +// are in `[0,1]`. +// +// See `rgb_to_hsv` for a description of the HSV encoding. +// +// Arguments: +// images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3. +// +// Returns `images` converted to RGB. +func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "HSVToRGB", + Input: []tf.Input{ + images, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes exponential of x - 1 element-wise. +// +// I.e., \\(y = (\exp x) - 1\\). 
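+//
+// Editorial note: as with the standard expm1 functions, this form is meant to stay
+// accurate for inputs near zero, where computing exp(x) and then subtracting 1 loses
+// precision to cancellation; for x = 1e-10 the true result is about 1.00000000005e-10,
+// while exp(x) - 1 evaluated in single precision rounds to 0.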
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Expm1", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MaxPoolAttr is an optional argument to MaxPool. +type MaxPoolAttr func(optionalAttr) + +// MaxPoolDataFormat sets the optional data_format attribute to value. +// +// value: Specify the data format of the input and output data. With the +// default format "NHWC", the data is stored in the order of: +// [batch, in_height, in_width, in_channels]. +// Alternatively, the format could be "NCHW", the data storage order of: +// [batch, in_channels, in_height, in_width]. +// If not specified, defaults to "NHWC" +func MaxPoolDataFormat(value string) MaxPoolAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Performs max pooling on the input. +// +// Arguments: +// input: 4-D input to pool over. +// ksize: The size of the window for each dimension of the input tensor. +// strides: The stride of the sliding window for each dimension of the +// input tensor. +// padding: The type of padding algorithm to use. +// +// Returns The max pooled output tensor. +func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MaxPool", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes natural logarithm of (1 + x) element-wise. +// +// I.e., \\(y = \log_e (1 + x)\\). +func Log1p(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Log1p", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Compute the Hurwitz zeta function \\(\zeta(x, q)\\). +// +// The Hurwitz zeta function is defined as: +// +// +// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\) +func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Zeta", + Input: []tf.Input{ + x, q, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes hyperbolic tangent of `x` element-wise. +func Tanh(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Tanh", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Convert JSON-encoded Example records to binary protocol buffer strings. +// +// This op translates a tensor containing Example records, encoded using +// the [standard JSON +// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json), +// into a tensor containing the same records encoded as binary protocol +// buffers. The resulting tensor can then be fed to any of the other +// Example-parsing ops. +// +// Arguments: +// json_examples: Each string is a JSON object serialized according to the JSON +// mapping of the Example proto. +// +// Returns Each string is a binary Example protocol buffer corresponding +// to the respective element of `json_examples`. 
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "DecodeJSONExample", + Input: []tf.Input{ + json_examples, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes inverse hyperbolic sine of x element-wise. +func Asinh(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Asinh", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression. +type NonMaxSuppressionAttr func(optionalAttr) + +// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value. +// +// value: A float representing the threshold for deciding whether boxes +// overlap too much with respect to IOU. +// If not specified, defaults to 0.5 +func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr { + return func(m optionalAttr) { + m["iou_threshold"] = value + } +} + +// Greedily selects a subset of bounding boxes in descending order of score, +// +// pruning away boxes that have high intersection-over-union (IOU) overlap +// with previously selected boxes. Bounding boxes are supplied as +// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any +// diagonal pair of box corners and the coordinates can be provided as normalized +// (i.e., lying in the interval [0, 1]) or absolute. Note that this algorithm +// is agnostic to where the origin is in the coordinate system. Note that this +// algorithm is invariant to orthogonal transformations and translations +// of the coordinate system; thus translating or reflections of the coordinate +// system result in the same boxes being selected by the algorithm. +// The output of this operation is a set of integers indexing into the input +// collection of bounding boxes representing the selected boxes. The bounding +// box coordinates corresponding to the selected indices can then be obtained +// using the `tf.gather operation`. For example: +// selected_indices = tf.image.non_max_suppression( +// boxes, scores, max_output_size, iou_threshold) +// selected_boxes = tf.gather(boxes, selected_indices) +// +// Arguments: +// boxes: A 2-D float tensor of shape `[num_boxes, 4]`. +// scores: A 1-D float tensor of shape `[num_boxes]` representing a single +// score corresponding to each box (each row of boxes). +// max_output_size: A scalar integer tensor representing the maximum number of +// boxes to be selected by non max suppression. +// +// Returns A 1-D integer tensor of shape `[M]` representing the selected +// indices from the boxes tensor, where `M <= max_output_size`. +func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "NonMaxSuppression", + Input: []tf.Input{ + boxes, scores, max_output_size, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the gradient for the tanh of `x` wrt its input. +// +// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy` +// is the corresponding input gradient. 
+func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TanhGrad", + Input: []tf.Input{ + y, dy, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceApplyAdamWithAmsgradAttr is an optional argument to ResourceApplyAdamWithAmsgrad. +type ResourceApplyAdamWithAmsgradAttr func(optionalAttr) + +// ResourceApplyAdamWithAmsgradUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var, m, and v tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceApplyAdamWithAmsgradUseLocking(value bool) ResourceApplyAdamWithAmsgradAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' according to the Adam algorithm. +// +// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$ +// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ +// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ +// $$vhat_t := max{vhat_{t-1}, v_t}$$ +// $$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$ +// +// Arguments: +// var_: Should be from a Variable(). +// m: Should be from a Variable(). +// v: Should be from a Variable(). +// vhat: Should be from a Variable(). +// beta1_power: Must be a scalar. +// beta2_power: Must be a scalar. +// lr: Scaling factor. Must be a scalar. +// beta1: Momentum factor. Must be a scalar. +// beta2: Momentum factor. Must be a scalar. +// epsilon: Ridge term. Must be a scalar. +// grad: The gradient. +// +// Returns the created operation. +func ResourceApplyAdamWithAmsgrad(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, vhat tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamWithAmsgradAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceApplyAdamWithAmsgrad", + Input: []tf.Input{ + var_, m, v, vhat, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Computes Psi, the derivative of Lgamma (the log of the absolute value of +// +// `Gamma(x)`), element-wise. +func Digamma(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Digamma", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the reverse mode backpropagated gradient of the Cholesky algorithm. +// +// For an explanation see "Differentiation of the Cholesky algorithm" by +// Iain Murray http://arxiv.org/abs/1602.07527. +// +// Arguments: +// l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`. +// Algorithm depends only on lower triangular part of the innermost matrices of +// this tensor. +// grad: df/dl where f is some scalar function. Shape is `[..., M, M]`. +// Algorithm depends only on lower triangular part of the innermost matrices of +// this tensor. +// +// Returns Symmetrized version of df/dA . 
Shape is `[..., M, M]` +func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "CholeskyGrad", + Input: []tf.Input{ + l, grad, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes cos of x element-wise. +func Cos(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Cos", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the trignometric inverse tangent of x element-wise. +// +// The `tf.math.atan` operation returns the inverse of `tf.math.tan`, such that +// if `y = tf.math.tan(x)` then, `x = tf.math.atan(y)`. +// +// **Note**: The output of `tf.math.atan` will lie within the invertible range +// of tan, i.e (-pi/2, pi/2). +// +// For example: +// +// ```python +// # Note: [1.047, 0.785] ~= [(pi/3), (pi/4)] +// x = tf.constant([1.047, 0.785]) +// y = tf.math.tan(x) # [1.731261, 0.99920404] +// +// tf.math.atan(y) # [1.047, 0.785] = x +// ``` +// +func Atan(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Atan", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2. +type Conv3DBackpropFilterV2Attr func(optionalAttr) + +// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value. +// +// value: The data format of the input and output data. With the +// default format "NDHWC", the data is stored in the order of: +// [batch, in_depth, in_height, in_width, in_channels]. +// Alternatively, the format could be "NCDHW", the data storage order is: +// [batch, in_channels, in_depth, in_height, in_width]. +// If not specified, defaults to "NDHWC" +func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value. +// +// value: 1-D tensor of length 5. The dilation factor for each dimension of +// `input`. If set to k > 1, there will be k-1 skipped cells between each +// filter element on that dimension. The dimension order is determined by the +// value of `data_format`, see above for details. Dilations in the batch and +// depth dimensions must be 1. +// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 > +func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes the gradients of 3-D convolution with respect to the filter. +// +// Arguments: +// input: Shape `[batch, depth, rows, cols, in_channels]`. +// filter_sizes: An integer vector representing the tensor shape of `filter`, +// where `filter` is a 5-D +// `[filter_depth, filter_height, filter_width, in_channels, out_channels]` +// tensor. +// out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols, +// out_channels]`. +// strides: 1-D tensor of length 5. The stride of the sliding window for each +// dimension of `input`. Must have `strides[0] = strides[4] = 1`. +// padding: The type of padding algorithm to use. 
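+//
+// A minimal Go usage sketch (illustrative only; the placeholder shapes,
+// filter sizes, strides and names below are assumptions, not part of the
+// op definition):
+//
+// ```go
+// s := op.NewScope()
+// input := op.Placeholder(s, tf.Float)       // [batch, depth, rows, cols, in_channels]
+// outBackprop := op.Placeholder(s, tf.Float) // [batch, out_depth, out_rows, out_cols, out_channels]
+// filterSizes := op.Const(s, []int32{3, 3, 3, 8, 16})
+// dFilter := op.Conv3DBackpropFilterV2(s, input, filterSizes, outBackprop,
+// 	[]int64{1, 1, 1, 1, 1}, "SAME")
+// _ = dFilter
+// ```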
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Conv3DBackpropFilterV2", + Input: []tf.Input{ + input, filter_sizes, out_backprop, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns which elements of x are Inf. +// +// @compatibility(numpy) +// Equivalent to np.isinf +// @end_compatibility +func IsInf(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "IsInf", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns which elements of x are finite. +// +// @compatibility(numpy) +// Equivalent to np.isfinite +// @end_compatibility +func IsFinite(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "IsFinite", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Generates sparse cross from a list of sparse and dense tensors. +// +// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each +// representing features of one feature column. It outputs a 2D `SparseTensor` with +// the batchwise crosses of these features. +// +// For example, if the inputs are +// +// inputs[0]: SparseTensor with shape = [2, 2] +// [0, 0]: "a" +// [1, 0]: "b" +// [1, 1]: "c" +// +// inputs[1]: SparseTensor with shape = [2, 1] +// [0, 0]: "d" +// [1, 0]: "e" +// +// inputs[2]: Tensor [["f"], ["g"]] +// +// then the output will be +// +// shape = [2, 2] +// [0, 0]: "a_X_d_X_f" +// [1, 0]: "b_X_e_X_g" +// [1, 1]: "c_X_e_X_g" +// +// if hashed_output=true then the output will be +// +// shape = [2, 2] +// [0, 0]: FingerprintCat64( +// Fingerprint64("f"), FingerprintCat64( +// Fingerprint64("d"), Fingerprint64("a"))) +// [1, 0]: FingerprintCat64( +// Fingerprint64("g"), FingerprintCat64( +// Fingerprint64("e"), Fingerprint64("b"))) +// [1, 1]: FingerprintCat64( +// Fingerprint64("g"), FingerprintCat64( +// Fingerprint64("e"), Fingerprint64("c"))) +// +// Arguments: +// indices: 2-D. Indices of each input `SparseTensor`. +// values: 1-D. values of each `SparseTensor`. +// shapes: 1-D. Shapes of each `SparseTensor`. +// dense_inputs: 2-D. Columns represented by dense `Tensor`. +// hashed_output: If true, returns the hash of the cross instead of the string. +// This will allow us avoiding string manipulations. +// num_buckets: It is used if hashed_output is true. +// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value. +// hash_key: Specify the hash_key that will be used by the `FingerprintCat64` +// function to combine the crosses fingerprints. +// +// +// +// Returns 2-D. Indices of the concatenated `SparseTensor`.1-D. Non-empty values of the concatenated or hashed +// `SparseTensor`.1-D. Shape of the concatenated `SparseTensor`. 
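+//
+// A minimal Go usage sketch for the string (non-hashed) case (illustrative
+// only; the sparse/dense literals and names below are assumptions, not part
+// of the op definition):
+//
+// ```go
+// s := op.NewScope()
+// spIndices := op.Const(s, [][]int64{{0, 0}, {1, 0}, {1, 1}})
+// spValues := op.Const(s, []string{"a", "b", "c"})
+// spShape := op.Const(s, []int64{2, 2})
+// dense := op.Const(s, [][]string{{"f"}, {"g"}})
+// outIdx, outVals, outShape := op.SparseCross(s,
+// 	[]tf.Output{spIndices}, []tf.Output{spValues}, []tf.Output{spShape},
+// 	[]tf.Output{dense},
+// 	false /*hashed_output*/, 0 /*num_buckets*/, 0 /*hash_key*/,
+// 	tf.String /*out_type*/, tf.String /*internal_type*/)
+// _, _, _ = outIdx, outVals, outShape
+// ```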
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type} + opspec := tf.OpSpec{ + Type: "SparseCross", + Input: []tf.Input{ + tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Returns an element-wise indication of the sign of a number. +// +// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`. +// +// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`. +func Sign(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Sign", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl. +type ResourceSparseApplyFtrlAttr func(optionalAttr) + +// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var and accum tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update relevant entries in '*var' according to the Ftrl-proximal scheme. +// +// That is for rows we have grad for, we update var, accum and linear as follows: +// accum_new = accum + grad * grad +// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +// accum = accum_new +// +// Arguments: +// var_: Should be from a Variable(). +// accum: Should be from a Variable(). +// linear: Should be from a Variable(). +// grad: The gradient. +// indices: A vector of indices into the first dimension of var and accum. +// lr: Scaling factor. Must be a scalar. +// l1: L1 regularization. Must be a scalar. +// l2: L2 regularization. Must be a scalar. +// lr_power: Scaling factor. Must be a scalar. +// +// Returns the created operation. +func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceSparseApplyFtrl", + Input: []tf.Input{ + var_, accum, linear, grad, indices, lr, l1, l2, lr_power, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Returns element-wise smallest integer not less than x. 
+func Ceil(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Ceil", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns x + y element-wise. +// +// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Add", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes rectified linear gradients for a Relu operation. +// +// Arguments: +// gradients: The backpropagated gradients to the corresponding Relu operation. +// features: The features passed as input to the corresponding Relu operation, OR +// the outputs of that operation (both work equivalently). +// +// Returns `gradients * (features > 0)`. +func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ReluGrad", + Input: []tf.Input{ + gradients, features, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns x + y element-wise. +// +// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "AddV2", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Pads a tensor with zeros. +// +// This operation pads a `input` with zeros according to the `paddings` you +// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the +// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates +// how many zeros to add before the contents of `input` in that dimension, and +// `paddings[D, 1]` indicates how many zeros to add after the contents of `input` +// in that dimension. +// +// The padded size of each dimension D of the output is: +// +// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` +// +// For example: +// +// ``` +// # 't' is [[1, 1], [2, 2]] +// # 'paddings' is [[1, 1], [2, 2]] +// # rank of 't' is 2 +// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] +// [0, 0, 1, 1, 0, 0] +// [0, 0, 2, 2, 0, 0] +// [0, 0, 0, 0, 0, 0]] +// ``` +// +func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Pad", + Input: []tf.Input{ + input, paddings, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Get the value of the tensor specified by its handle. +// +// Arguments: +// handle: The handle for a tensor stored in the session state. +// dtype: The type of the output value. +// +// Returns The tensor for the given handle. +func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype} + opspec := tf.OpSpec{ + Type: "GetSessionTensor", + Input: []tf.Input{ + handle, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns x - y element-wise. +// +// *NOTE*: `Subtract` supports broadcasting. 
More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Sub", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns x * y element-wise. Returns zero if y is zero, even if x if infinite or NaN. +// +// *NOTE*: `Mul` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func MulNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "MulNoNan", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AvgPoolAttr is an optional argument to AvgPool. +type AvgPoolAttr func(optionalAttr) + +// AvgPoolDataFormat sets the optional data_format attribute to value. +// +// value: Specify the data format of the input and output data. With the +// default format "NHWC", the data is stored in the order of: +// [batch, in_height, in_width, in_channels]. +// Alternatively, the format could be "NCHW", the data storage order of: +// [batch, in_channels, in_height, in_width]. +// If not specified, defaults to "NHWC" +func AvgPoolDataFormat(value string) AvgPoolAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Performs average pooling on the input. +// +// Each entry in `output` is the mean of the corresponding size `ksize` +// window in `value`. +// +// Arguments: +// value: 4-D with shape `[batch, height, width, channels]`. +// ksize: The size of the sliding window for each dimension of `value`. +// strides: The stride of the sliding window for each dimension of `value`. +// padding: The type of padding algorithm to use. +// +// Returns The average pooled output tensor. +func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "AvgPool", + Input: []tf.Input{ + value, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns x / y element-wise. +// +// *NOTE*: `Div` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Div", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns x / y element-wise for integer types. +// +// Truncation designates that negative numbers will round fractional quantities +// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different +// than Python semantics. See `FloorDiv` for a division function that matches +// Python Semantics. +// +// *NOTE*: `TruncateDiv` supports broadcasting. 
More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TruncateDiv", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Scatter `updates` into a new tensor according to `indices`. +// +// Creates a new tensor by applying sparse `updates` to individual values or +// slices within a tensor (initially zero for numeric, empty for string) of +// the given `shape` according to indices. This operator is the inverse of the +// `tf.gather_nd` operator which extracts values or slices from a given tensor. +// +// This operation is similar to tensor_scatter_add, except that the tensor is +// zero-initialized. Calling `tf.scatter_nd(indices, values, shape)` is identical +// to `tensor_scatter_add(tf.zeros(shape, values.dtype), indices, values)` +// +// If `indices` contains duplicates, then their updates are accumulated (summed). +// +// **WARNING**: The order in which updates are applied is nondeterministic, so the +// output will be nondeterministic if `indices` contains duplicates -- because +// of some numerical approximation issues, numbers summed in different order +// may yield different results. +// +// `indices` is an integer tensor containing indices into a new tensor of shape +// `shape`. The last dimension of `indices` can be at most the rank of `shape`: +// +// indices.shape[-1] <= shape.rank +// +// The last dimension of `indices` corresponds to indices into elements +// (if `indices.shape[-1] = shape.rank`) or slices +// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of +// `shape`. `updates` is a tensor with shape +// +// indices.shape[:-1] + shape[indices.shape[-1]:] +// +// The simplest form of scatter is to insert individual elements in a tensor by +// index. For example, say we want to insert 4 scattered elements in a rank-1 +// tensor with 8 elements. +// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt> +// </div> +// +// In Python, this scatter operation would look like this: +// +// ```python +// indices = tf.constant([[4], [3], [1], [7]]) +// updates = tf.constant([9, 10, 11, 12]) +// shape = tf.constant([8]) +// scatter = tf.scatter_nd(indices, updates, shape) +// with tf.Session() as sess: +// print(sess.run(scatter)) +// ``` +// +// The resulting tensor would look like this: +// +// [0, 11, 0, 10, 9, 0, 0, 12] +// +// We can also, insert entire slices of a higher rank tensor all at once. For +// example, if we wanted to insert two slices in the first dimension of a +// rank-3 tensor with two matrices of new values. 
+// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt> +// </div> +// +// In Python, this scatter operation would look like this: +// +// ```python +// indices = tf.constant([[0], [2]]) +// updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6], +// [7, 7, 7, 7], [8, 8, 8, 8]], +// [[5, 5, 5, 5], [6, 6, 6, 6], +// [7, 7, 7, 7], [8, 8, 8, 8]]]) +// shape = tf.constant([4, 4, 4]) +// scatter = tf.scatter_nd(indices, updates, shape) +// with tf.Session() as sess: +// print(sess.run(scatter)) +// ``` +// +// The resulting tensor would look like this: +// +// [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], +// [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], +// [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], +// [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]] +// +// Note that on CPU, if an out of bound index is found, an error is returned. +// On GPU, if an out of bound index is found, the index is ignored. +// +// Arguments: +// indices: Index tensor. +// updates: Updates to scatter into output. +// shape: 1-D. The shape of the resulting tensor. +// +// Returns A new tensor with the given shape and updates applied according +// to the indices. +func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ScatterNd", + Input: []tf.Input{ + indices, updates, shape, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the power of one value to another. +// +// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for +// corresponding elements in `x` and `y`. For example: +// +// ``` +// # tensor 'x' is [[2, 2]], [3, 3]] +// # tensor 'y' is [[8, 16], [2, 3]] +// tf.pow(x, y) ==> [[256, 65536], [9, 27]] +// ``` +func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Pow", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns (x - y)(x - y) element-wise. +// +// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SquaredDifference", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns 0 if x == 0, and x / y otherwise, elementwise. +func Xdivy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Xdivy", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Reverses specific dimensions of a tensor. +// +// NOTE `tf.reverse` has now changed behavior in preparation for 1.0. +// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0. +// +// Given a `tensor`, and a `int32` tensor `axis` representing the set of +// dimensions of `tensor` to reverse. This operation reverses each dimension +// `i` for which there exists `j` s.t. `axis[j] == i`. +// +// `tensor` can have up to 8 dimensions. The number of dimensions specified +// in `axis` may be 0 or more entries. 
If an index is specified more than +// once, a InvalidArgument error is raised. +// +// For example: +// +// ``` +// # tensor 't' is [[[[ 0, 1, 2, 3], +// # [ 4, 5, 6, 7], +// # [ 8, 9, 10, 11]], +// # [[12, 13, 14, 15], +// # [16, 17, 18, 19], +// # [20, 21, 22, 23]]]] +// # tensor 't' shape is [1, 2, 3, 4] +// +// # 'dims' is [3] or 'dims' is [-1] +// reverse(t, dims) ==> [[[[ 3, 2, 1, 0], +// [ 7, 6, 5, 4], +// [ 11, 10, 9, 8]], +// [[15, 14, 13, 12], +// [19, 18, 17, 16], +// [23, 22, 21, 20]]]] +// +// # 'dims' is '[1]' (or 'dims' is '[-3]') +// reverse(t, dims) ==> [[[[12, 13, 14, 15], +// [16, 17, 18, 19], +// [20, 21, 22, 23] +// [[ 0, 1, 2, 3], +// [ 4, 5, 6, 7], +// [ 8, 9, 10, 11]]]] +// +// # 'dims' is '[2]' (or 'dims' is '[-2]') +// reverse(t, dims) ==> [[[[8, 9, 10, 11], +// [4, 5, 6, 7], +// [0, 1, 2, 3]] +// [[20, 21, 22, 23], +// [16, 17, 18, 19], +// [12, 13, 14, 15]]]] +// ``` +// +// Arguments: +// tensor: Up to 8-D. +// axis: 1-D. The indices of the dimensions to reverse. Must be in the range +// `[-rank(tensor), rank(tensor))`. +// +// Returns The same shape as `tensor`. +func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ReverseV2", + Input: []tf.Input{ + tensor, axis, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse. +type SparseReduceMaxSparseAttr func(optionalAttr) + +// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value. +// +// value: If true, retain reduced dimensions with length 1. +// If not specified, defaults to false +func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr { + return func(m optionalAttr) { + m["keep_dims"] = value + } +} + +// Computes the max of elements across dimensions of a SparseTensor. +// +// This Op takes a SparseTensor and is the sparse counterpart to +// `tf.reduce_max()`. In contrast to SparseReduceMax, this Op returns a +// SparseTensor. +// +// Reduces `sp_input` along the dimensions given in `reduction_axes`. Unless +// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained +// with length 1. +// +// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor +// with a single element is returned. Additionally, the axes can be negative, +// which are interpreted according to the indexing rules in Python. +// +// Arguments: +// input_indices: 2-D. `N x R` matrix with the indices of non-empty values in a +// SparseTensor, possibly not in canonical ordering. +// input_values: 1-D. `N` non-empty values corresponding to `input_indices`. +// input_shape: 1-D. Shape of the input SparseTensor. +// reduction_axes: 1-D. Length-`K` vector containing the reduction axes. 
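+//
+// A minimal Go usage sketch reducing a small SparseTensor along axis 1
+// (illustrative only; the literal values and names below are assumptions,
+// not part of the op definition):
+//
+// ```go
+// s := op.NewScope()
+// indices := op.Const(s, [][]int64{{0, 0}, {1, 2}})
+// values := op.Const(s, []float32{3, 5})
+// shape := op.Const(s, []int64{2, 3})
+// axes := op.Const(s, []int32{1})
+// outIdx, outVals, outShape := op.SparseReduceMaxSparse(s, indices, values, shape, axes,
+// 	op.SparseReduceMaxSparseKeepDims(false))
+// _, _, _ = outIdx, outVals, outShape
+// ```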
+func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SparseReduceMaxSparse", + Input: []tf.Input{ + input_indices, input_values, input_shape, reduction_axes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Looks up keys in a table, outputs the corresponding values. +// +// The tensor `keys` must of the same type as the keys of the table. +// The output `values` is of the type of the table values. +// +// The scalar `default_value` is the value output for keys not present in the +// table. It must also be of the same type as the table values. +// +// Arguments: +// table_handle: Handle to the table. +// keys: Any shape. Keys to look up. +// +// +// Returns Same shape as `keys`. Values found in the table, or `default_values` +// for missing keys. +func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "LookupTableFindV2", + Input: []tf.Input{ + table_handle, keys, default_value, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the min of x and y (i.e. x < y ? x : y) element-wise. +// +// *NOTE*: `Minimum` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Minimum", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Compute the regularized incomplete beta integral \\(I_x(a, b)\\). +// +// The regularized incomplete beta integral is defined as: +// +// +// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\) +// +// where +// +// +// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\) +// +// +// is the incomplete beta function and \\(B(a, b)\\) is the *complete* +// beta function. +func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Betainc", + Input: []tf.Input{ + a, b, x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// SpaceToBatch for 4-D tensors of type T. +// +// This is a legacy version of the more general SpaceToBatchND. +// +// Zero-pads and then rearranges (permutes) blocks of spatial data into batch. +// More specifically, this op outputs a copy of the input tensor where values from +// the `height` and `width` dimensions are moved to the `batch` dimension. After +// the zero-padding, both `height` and `width` of the input must be divisible by the +// block size. +// +// Arguments: +// input: 4-D with shape `[batch, height, width, depth]`. +// paddings: 2-D tensor of non-negative integers with shape `[2, 2]`. 
It specifies +// the padding of the input with zeros across the spatial dimensions as follows: +// +// paddings = [[pad_top, pad_bottom], [pad_left, pad_right]] +// +// The effective spatial dimensions of the zero-padded input tensor will be: +// +// height_pad = pad_top + height + pad_bottom +// width_pad = pad_left + width + pad_right +// +// The attr `block_size` must be greater than one. It indicates the block size. +// +// * Non-overlapping blocks of size `block_size x block size` in the height and +// width dimensions are rearranged into the batch dimension at each location. +// * The batch of the output tensor is `batch * block_size * block_size`. +// * Both height_pad and width_pad must be divisible by block_size. +// +// The shape of the output will be: +// +// [batch*block_size*block_size, height_pad/block_size, width_pad/block_size, +// depth] +// +// Some examples: +// +// (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2: +// +// ``` +// x = [[[[1], [2]], [[3], [4]]]] +// ``` +// +// The output tensor has shape `[4, 1, 1, 1]` and value: +// +// ``` +// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]] +// ``` +// +// (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2: +// +// ``` +// x = [[[[1, 2, 3], [4, 5, 6]], +// [[7, 8, 9], [10, 11, 12]]]] +// ``` +// +// The output tensor has shape `[4, 1, 1, 3]` and value: +// +// ``` +// [[[[1, 2, 3]]], [[[4, 5, 6]]], [[[7, 8, 9]]], [[[10, 11, 12]]]] +// ``` +// +// (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2: +// +// ``` +// x = [[[[1], [2], [3], [4]], +// [[5], [6], [7], [8]], +// [[9], [10], [11], [12]], +// [[13], [14], [15], [16]]]] +// ``` +// +// The output tensor has shape `[4, 2, 2, 1]` and value: +// +// ``` +// x = [[[[1], [3]], [[9], [11]]], +// [[[2], [4]], [[10], [12]]], +// [[[5], [7]], [[13], [15]]], +// [[[6], [8]], [[14], [16]]]] +// ``` +// +// (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2: +// +// ``` +// x = [[[[1], [2], [3], [4]], +// [[5], [6], [7], [8]]], +// [[[9], [10], [11], [12]], +// [[13], [14], [15], [16]]]] +// ``` +// +// The output tensor has shape `[8, 1, 2, 1]` and value: +// +// ``` +// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]], +// [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]] +// ``` +// +// Among others, this operation is useful for reducing atrous convolution into +// regular convolution. +// +func SpaceToBatch(scope *Scope, input tf.Output, paddings tf.Output, block_size int64) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"block_size": block_size} + opspec := tf.OpSpec{ + Type: "SpaceToBatch", + Input: []tf.Input{ + input, paddings, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// The shape of the elements of the given list, as a tensor. +// +// input_handle: the list +// element_shape: the shape of elements of the list +func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"shape_type": shape_type} + opspec := tf.OpSpec{ + Type: "TensorListElementShape", + Input: []tf.Input{ + input_handle, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Compute the upper regularized incomplete Gamma function `Q(a, x)`. 
+// +// The upper regularized incomplete Gamma function is defined as: +// +// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\) +// +// where +// +// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\) +// +// is the upper incomplete Gama function. +// +// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete +// Gamma function. +func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Igammac", + Input: []tf.Input{ + a, x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Converts the given `resource_handle` representing an iterator to a variant tensor. // // Arguments: @@ -17926,6 +32634,59 @@ func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.O return op.Output(0) } +// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes. +type CropAndResizeGradBoxesAttr func(optionalAttr) + +// CropAndResizeGradBoxesMethod sets the optional method attribute to value. +// +// value: A string specifying the interpolation method. Only 'bilinear' is +// supported for now. +// If not specified, defaults to "bilinear" +func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr { + return func(m optionalAttr) { + m["method"] = value + } +} + +// Computes the gradient of the crop_and_resize op wrt the input boxes tensor. +// +// Arguments: +// grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`. +// image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`. +// Both `image_height` and `image_width` need to be positive. +// boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor +// specifies the coordinates of a box in the `box_ind[i]` image and is specified +// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of +// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the +// `[0, 1]` interval of normalized image height is mapped to +// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in +// which case the sampled crop is an up-down flipped version of the original +// image. The width dimension is treated similarly. Normalized coordinates +// outside the `[0, 1]` range are allowed, in which case we use +// `extrapolation_value` to extrapolate the input image values. +// box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`. +// The value of `box_ind[i]` specifies the image that the `i`-th box refers to. +// +// Returns A 2-D tensor of shape `[num_boxes, 4]`. +func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "CropAndResizeGradBoxes", + Input: []tf.Input{ + grads, image, boxes, box_ind, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug. 
type RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr func(optionalAttr) @@ -17972,43 +32733,390 @@ func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug(scope *Scope, num_shar return op.Output(0), op.Output(1), op.Output(2), op.Output(3) } -// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient. -type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr) - -// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value. +// Compute the lower regularized incomplete Gamma function `P(a, x)`. // -// value: The bitwidth of the quantization; between 2 and 8, inclusive. -// If not specified, defaults to 8 -func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr { - return func(m optionalAttr) { - m["num_bits"] = value +// The lower regularized incomplete Gamma function is defined as: +// +// +// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\) +// +// where +// +// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\) +// +// is the lower incomplete Gamma function. +// +// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete +// Gamma function. +func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) { + if scope.Err() != nil { + return } + opspec := tf.OpSpec{ + Type: "Igamma", + Input: []tf.Input{ + a, x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) } -// FakeQuantWithMinMaxVarsGradientNarrowRange sets the optional narrow_range attribute to value. -// -// value: Whether to quantize into 2^num_bits - 1 distinct values. -// If not specified, defaults to false -func FakeQuantWithMinMaxVarsGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsGradientAttr { - return func(m optionalAttr) { - m["narrow_range"] = value +// Mutually accumulates multiple tensors of identical type and shape. +func CollectiveGather(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) { + if scope.Err() != nil { + return } + attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape} + opspec := tf.OpSpec{ + Type: "CollectiveGather", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) } -// Compute gradients for a FakeQuantWithMinMaxVars operation. +// Returns element-wise remainder of division. This emulates C semantics in that +// +// the result here is consistent with a truncating divide. E.g. `truncate(x / y) * +// y + truncate_mod(x, y) = x`. +// +// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TruncateMod", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the gradient of `igamma(a, x)` wrt `a`. +func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "IgammaGradA", + Input: []tf.Input{ + a, x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes arctangent of `y/x` element-wise, respecting signs of the arguments. 
+// +// This is the angle \( \theta \in [-\pi, \pi] \) such that +// \[ x = r \cos(\theta) \] +// and +// \[ y = r \sin(\theta) \] +// where \(r = \sqrt(x^2 + y^2) \). +func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Atan2", + Input: []tf.Input{ + y, x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// An Op to sum inputs across replicated TPU instances. +// +// Each instance supplies its own input. +// +// For example, suppose there are 8 TPU instances: `[A, B, C, D, E, F, G, H]`. +// Passing group_assignment=`[[0,2,4,6],[1,3,5,7]]` sets `A, C, E, G` as group 0, +// and `B, D, F, H` as group 1. Thus we get the outputs: +// `[A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H]`. +// +// Arguments: +// input: The local input to the sum. +// group_assignment: An int32 tensor with shape +// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the +// replica ids in the ith subgroup. +// +// Returns The sum of all the distributed inputs. +func CrossReplicaSum(scope *Scope, input tf.Output, group_assignment tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "CrossReplicaSum", + Input: []tf.Input{ + input, group_assignment, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the truth value of (x > y) element-wise. +// +// *NOTE*: `Greater` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Greater", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns immutable tensor from memory region. +// +// The current implementation memmaps the tensor from a file. +// +// Arguments: +// dtype: Type of the returned tensor. +// shape: Shape of the returned tensor. +// memory_region_name: Name of readonly memory region used by the tensor, see +// NewReadOnlyMemoryRegionFromFile in tensorflow::Env. +func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name} + opspec := tf.OpSpec{ + Type: "ImmutableConst", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the truth value of (x >= y) element-wise. +// +// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "GreaterEqual", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the truth value of (x == y) element-wise. +// +// *NOTE*: `Equal` supports broadcasting. 
More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Equal", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the truth value of x AND y element-wise. +// +// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "LogicalAnd", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ExperimentalParseExampleDatasetAttr is an optional argument to ExperimentalParseExampleDataset. +type ExperimentalParseExampleDatasetAttr func(optionalAttr) + +// ExperimentalParseExampleDatasetSloppy sets the optional sloppy attribute to value. +// If not specified, defaults to false +func ExperimentalParseExampleDatasetSloppy(value bool) ExperimentalParseExampleDatasetAttr { + return func(m optionalAttr) { + m["sloppy"] = value + } +} + +// Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features. // // Arguments: -// gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation. -// inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation. -// min, max: Quantization interval, scalar floats. // // +// dense_defaults: A dict mapping string keys to `Tensor`s. +// The keys of the dict must match the dense_keys of the feature. +// sparse_keys: A list of string keys in the examples features. +// The results for these keys will be returned as `SparseTensor` objects. +// dense_keys: A list of Ndense string Tensors (scalars). +// The keys expected in the Examples features associated with dense values. +// sparse_types: A list of `DTypes` of the same length as `sparse_keys`. +// Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), +// and `tf.string` (`BytesList`) are supported. +// dense_shapes: List of tuples with the same length as `dense_keys`. +// The shape of the data for each dense feature referenced by `dense_keys`. +// Required for any input tensors identified by `dense_keys`. Must be +// either fully defined, or may contain an unknown first dimension. +// An unknown first dimension means the feature is treated as having +// a variable number of blocks, and the output shape along this dimension +// is considered unknown at graph build time. Padding is applied for +// minibatch elements smaller than the maximum number of blocks for the +// given feature along this dimension. +// output_types: The type list for the return values. +// output_shapes: The list of shapes being produced. 
+func ExperimentalParseExampleDataset(scope *Scope, input_dataset tf.Output, num_parallel_calls tf.Output, dense_defaults []tf.Output, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ExperimentalParseExampleDatasetAttr) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes, "output_types": output_types, "output_shapes": output_shapes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ExperimentalParseExampleDataset", + Input: []tf.Input{ + input_dataset, num_parallel_calls, tf.OutputList(dense_defaults), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AudioSpectrogramAttr is an optional argument to AudioSpectrogram. +type AudioSpectrogramAttr func(optionalAttr) + +// AudioSpectrogramMagnitudeSquared sets the optional magnitude_squared attribute to value. // -// Returns Backpropagated gradients w.r.t. inputs: -// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter: -// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter: -// `sum(gradients * (inputs > max))`. -func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) { +// value: Whether to return the squared magnitude or just the +// magnitude. Using squared magnitude can avoid extra calculations. +// If not specified, defaults to false +func AudioSpectrogramMagnitudeSquared(value bool) AudioSpectrogramAttr { + return func(m optionalAttr) { + m["magnitude_squared"] = value + } +} + +// Produces a visualization of audio data over time. +// +// Spectrograms are a standard way of representing audio information as a series of +// slices of frequency information, one slice for each window of time. By joining +// these together into a sequence, they form a distinctive fingerprint of the sound +// over time. +// +// This op expects to receive audio data as an input, stored as floats in the range +// -1 to 1, together with a window width in samples, and a stride specifying how +// far to move the window between slices. From this it generates a three +// dimensional output. The first dimension is for the channels in the input, so a +// stereo audio input would have two here for example. The second dimension is time, +// with successive frequency slices. The third dimension has an amplitude value for +// each frequency during that time slice. +// +// This means the layout when converted and saved as an image is rotated 90 degrees +// clockwise from a typical spectrogram. Time is descending down the Y axis, and +// the frequency decreases from left to right. +// +// Each value in the result represents the square root of the sum of the real and +// imaginary parts of an FFT on the current window of samples. In this way, the +// lowest dimension represents the power of each frequency in the current window, +// and adjacent windows are concatenated in the next dimension. 
+// +// To get a more intuitive and visual look at what this operation does, you can run +// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the +// resulting spectrogram as a PNG image. +// +// Arguments: +// input: Float representation of audio data. +// window_size: How wide the input window is in samples. For the highest efficiency +// this should be a power of two, but other values are accepted. +// stride: How widely apart the center of adjacent sample windows should be. +// +// Returns 3D representation of the audio frequencies as an image. +func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride int64, optional ...AudioSpectrogramAttr) (spectrogram tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"window_size": window_size, "stride": stride} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "AudioSpectrogram", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MatMulAttr is an optional argument to MatMul. +type MatMulAttr func(optionalAttr) + +// MatMulTransposeA sets the optional transpose_a attribute to value. +// +// value: If true, "a" is transposed before multiplication. +// If not specified, defaults to false +func MatMulTransposeA(value bool) MatMulAttr { + return func(m optionalAttr) { + m["transpose_a"] = value + } +} + +// MatMulTransposeB sets the optional transpose_b attribute to value. +// +// value: If true, "b" is transposed before multiplication. +// If not specified, defaults to false +func MatMulTransposeB(value bool) MatMulAttr { + return func(m optionalAttr) { + m["transpose_b"] = value + } +} + +// Multiply the matrix "a" by the matrix "b". +// +// The inputs must be two-dimensional matrices and the inner dimension of +// "a" (after being transposed if transpose_a is true) must match the +// outer dimension of "b" (after being transposed if transposed_b is +// true). +// +// *Note*: The default kernel implementation for MatMul on GPUs uses +// cublas. +func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) { if scope.Err() != nil { return } @@ -18017,9 +33125,328 @@ func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs t a(attrs) } opspec := tf.OpSpec{ - Type: "FakeQuantWithMinMaxVarsGradient", + Type: "MatMul", Input: []tf.Input{ - gradients, inputs, min, max, + a, b, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// SumAttr is an optional argument to Sum. +type SumAttr func(optionalAttr) + +// SumKeepDims sets the optional keep_dims attribute to value. +// +// value: If true, retain reduced dimensions with length 1. +// If not specified, defaults to false +func SumKeepDims(value bool) SumAttr { + return func(m optionalAttr) { + m["keep_dims"] = value + } +} + +// Computes the sum of elements across dimensions of a tensor. +// +// Reduces `input` along the dimensions given in `axis`. Unless +// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +// `axis`. If `keep_dims` is true, the reduced dimensions are +// retained with length 1. +// +// Arguments: +// input: The tensor to reduce. +// axis: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. +// +// Returns The reduced tensor. 
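+//
+// A minimal usage sketch (illustrative only, not part of the generated API
+// surface): `s` is assumed to be an existing Scope and `x` a numeric Output
+// built elsewhere with this package's helpers (Const, SubScope).
+//
+//	axis := op.Const(s.SubScope("axis"), []int32{0, 1})
+//	total := op.Sum(s, x, axis, op.SumKeepDims(true))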
+func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Sum", + Input: []tf.Input{ + input, axis, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingAdagradParametersGradAccumDebug. +type RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr func(optionalAttr) + +// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Retrieve Adagrad embedding parameters with debug support. +// +// An op that retrieves optimization parameters from embedding to host +// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up +// the correct embedding table configuration. For example, this op is +// used to retrieve updated parameters before saving a checkpoint. +// +// Returns Parameter parameters updated by the Adagrad optimization algorithm.Parameter accumulators updated by the Adagrad optimization algorithm.Parameter gradient_accumulators updated by the Adagrad optimization algorithm. +func RetrieveTPUEmbeddingAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Batch normalization. +// +// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization() +// +// This op is deprecated. Prefer `tf.nn.batch_normalization`. +// +// Arguments: +// t: A 4D input Tensor. +// m: A 1D mean Tensor with size matching the last dimension of t. +// This is the first output from tf.nn.moments, +// or a saved moving average thereof. +// v: A 1D variance Tensor with size matching the last dimension of t. +// This is the second output from tf.nn.moments, +// or a saved moving average thereof. +// beta: A 1D beta Tensor with size matching the last dimension of t. +// An offset to be added to the normalized tensor. +// gamma: A 1D gamma Tensor with size matching the last dimension of t. +// If "scale_after_normalization" is true, this tensor will be multiplied +// with the normalized tensor. +// variance_epsilon: A small float number to avoid dividing by 0. 
+// scale_after_normalization: A bool indicating whether the resulted tensor +// needs to be multiplied with gamma. +func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization} + opspec := tf.OpSpec{ + Type: "BatchNormWithGlobalNormalization", + Input: []tf.Input{ + t, m, v, beta, gamma, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ProdAttr is an optional argument to Prod. +type ProdAttr func(optionalAttr) + +// ProdKeepDims sets the optional keep_dims attribute to value. +// +// value: If true, retain reduced dimensions with length 1. +// If not specified, defaults to false +func ProdKeepDims(value bool) ProdAttr { + return func(m optionalAttr) { + m["keep_dims"] = value + } +} + +// Computes the product of elements across dimensions of a tensor. +// +// Reduces `input` along the dimensions given in `axis`. Unless +// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +// `axis`. If `keep_dims` is true, the reduced dimensions are +// retained with length 1. +// +// Arguments: +// input: The tensor to reduce. +// axis: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. +// +// Returns The reduced tensor. +func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Prod", + Input: []tf.Input{ + input, axis, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Decode the frame(s) of a GIF-encoded image to a uint8 tensor. +// +// GIF images with frame or transparency compression are not supported. +// On Linux and MacOS systems, convert animated GIFs from compressed to +// uncompressed by running: +// +// convert $src.gif -coalesce $dst.gif +// +// This op also supports decoding JPEGs and PNGs, though it is cleaner to use +// `tf.image.decode_image`. +// +// Arguments: +// contents: 0-D. The GIF-encoded image. +// +// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB channel order. +func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "DecodeGif", + Input: []tf.Input{ + contents, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// EditDistanceAttr is an optional argument to EditDistance. +type EditDistanceAttr func(optionalAttr) + +// EditDistanceNormalize sets the optional normalize attribute to value. +// +// value: boolean (if true, edit distances are normalized by length of truth). +// +// The output is: +// If not specified, defaults to true +func EditDistanceNormalize(value bool) EditDistanceAttr { + return func(m optionalAttr) { + m["normalize"] = value + } +} + +// Computes the (possibly normalized) Levenshtein Edit Distance. +// +// The inputs are variable-length sequences provided by SparseTensors +// (hypothesis_indices, hypothesis_values, hypothesis_shape) +// and +// (truth_indices, truth_values, truth_shape). +// +// The inputs are: +// +// Arguments: +// hypothesis_indices: The indices of the hypothesis list SparseTensor. 
+// This is an N x R int64 matrix. +// hypothesis_values: The values of the hypothesis list SparseTensor. +// This is an N-length vector. +// hypothesis_shape: The shape of the hypothesis list SparseTensor. +// This is an R-length vector. +// truth_indices: The indices of the truth list SparseTensor. +// This is an M x R int64 matrix. +// truth_values: The values of the truth list SparseTensor. +// This is an M-length vector. +// truth_shape: truth indices, vector. +// +// Returns A dense float tensor with rank R - 1. +// +// For the example input: +// +// // hypothesis represents a 2x1 matrix with variable-length values: +// // (0,0) = ["a"] +// // (1,0) = ["b"] +// hypothesis_indices = [[0, 0, 0], +// [1, 0, 0]] +// hypothesis_values = ["a", "b"] +// hypothesis_shape = [2, 1, 1] +// +// // truth represents a 2x2 matrix with variable-length values: +// // (0,0) = [] +// // (0,1) = ["a"] +// // (1,0) = ["b", "c"] +// // (1,1) = ["a"] +// truth_indices = [[0, 1, 0], +// [1, 0, 0], +// [1, 0, 1], +// [1, 1, 0]] +// truth_values = ["a", "b", "c", "a"] +// truth_shape = [2, 2, 2] +// normalize = true +// +// The output will be: +// +// // output is a 2x2 matrix with edit distances normalized by truth lengths. +// output = [[inf, 1.0], // (0,0): no truth, (0,1): no hypothesis +// [0.5, 1.0]] // (1,0): addition, (1,1): no hypothesis +func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "EditDistance", + Input: []tf.Input{ + hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Adds Tensor 'bias' to Tensor 'input' for Quantized types. +// +// Broadcasts the values of bias on dimensions 0..N-2 of 'input'. +// +// Arguments: +// +// bias: A 1D bias Tensor with size matching the last dimension of 'input'. +// min_input: The float value that the lowest quantized input value represents. +// max_input: The float value that the highest quantized input value represents. +// min_bias: The float value that the lowest quantized bias value represents. +// max_bias: The float value that the highest quantized bias value represents. +// +// +// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. +func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"out_type": out_type} + opspec := tf.OpSpec{ + Type: "QuantizedBiasAdd", + Input: []tf.Input{ + input, bias, min_input, max_input, min_bias, max_bias, }, Attrs: attrs, } @@ -18027,34 +33454,33 @@ func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs t return op.Output(0), op.Output(1), op.Output(2) } -// RegexReplaceAttr is an optional argument to RegexReplace. -type RegexReplaceAttr func(optionalAttr) +// MinAttr is an optional argument to Min. 
+type MinAttr func(optionalAttr) -// RegexReplaceReplaceGlobal sets the optional replace_global attribute to value. +// MinKeepDims sets the optional keep_dims attribute to value. // -// value: If True, the replacement is global (that is, all matches of the `pattern` regular -// expression in each input string are rewritten), otherwise the `rewrite` -// substitution is only made for the first `pattern` match. -// If not specified, defaults to true -func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr { +// value: If true, retain reduced dimensions with length 1. +// If not specified, defaults to false +func MinKeepDims(value bool) MinAttr { return func(m optionalAttr) { - m["replace_global"] = value + m["keep_dims"] = value } } -// Replaces matches of the `pattern` regular expression in `input` with the -// replacement string provided in `rewrite`. +// Computes the minimum of elements across dimensions of a tensor. // -// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax) +// Reduces `input` along the dimensions given in `axis`. Unless +// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +// `axis`. If `keep_dims` is true, the reduced dimensions are +// retained with length 1. // // Arguments: -// input: The text to be processed. -// pattern: The regular expression to be matched in the `input` strings. -// rewrite: The rewrite string to be substituted for the `pattern` expression where it is -// matched in the `input` strings. +// input: The tensor to reduce. +// axis: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // -// Returns The text after applying pattern match and rewrite substitution. -func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) { +// Returns The reduced tensor. +func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -18063,9 +33489,9 @@ func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.O a(attrs) } opspec := tf.OpSpec{ - Type: "RegexReplace", + Type: "Min", Input: []tf.Input{ - input, pattern, rewrite, + input, axis, }, Attrs: attrs, } @@ -18073,83 +33499,37 @@ func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.O return op.Output(0) } -// Deprecated. Use TensorArraySplitV3 -// -// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3 -func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorArraySplitV2", - Input: []tf.Input{ - handle, value, lengths, flow_in, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} +// ArgMaxAttr is an optional argument to ArgMax. +type ArgMaxAttr func(optionalAttr) -// ResourceScatterNdSubAttr is an optional argument to ResourceScatterNdSub. -type ResourceScatterNdSubAttr func(optionalAttr) - -// ResourceScatterNdSubUseLocking sets the optional use_locking attribute to value. -// -// value: An optional bool. Defaults to True. If True, the assignment will -// be protected by a lock; otherwise the behavior is undefined, -// but may exhibit less contention. -// If not specified, defaults to true -func ResourceScatterNdSubUseLocking(value bool) ResourceScatterNdSubAttr { +// ArgMaxOutputType sets the optional output_type attribute to value. 
+// If not specified, defaults to DT_INT64 +func ArgMaxOutputType(value tf.DataType) ArgMaxAttr { return func(m optionalAttr) { - m["use_locking"] = value + m["output_type"] = value } } -// Applies sparse subtraction to individual values or slices in a Variable. +// Returns the index with the largest value across dimensions of a tensor. // -// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. +// Note that in case of ties the identity of the return value is not guaranteed. // -// `indices` must be integer tensor, containing indices into `ref`. -// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`. -// -// The innermost dimension of `indices` (with length `K`) corresponds to -// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th -// dimension of `ref`. -// -// `updates` is `Tensor` of rank `Q-1+P-K` with shape: -// -// ``` -// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]] -// ``` -// -// For example, say we want to subtract 4 scattered elements from a rank-1 tensor -// with 8 elements. In Python, that subtraction would look like this: -// -// ```python -// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True) -// indices = tf.constant([[4], [3], [1], [7]]) -// updates = tf.constant([9, 10, 11, 12]) -// sub = tf.scatter_nd_sub(ref, indices, updates) -// with tf.Session() as sess: -// print sess.run(sub) -// ``` -// -// The resulting update to ref would look like this: -// -// [1, -9, 3, -6, -4, 6, 7, -4] -// -// See `tf.scatter_nd` for more details about how to make updates to -// slices. +// Usage: +// ```python +// import tensorflow as tf +// a = [1, 10, 26.9, 2.8, 166.32, 62.3] +// b = tf.math.argmax(input = a) +// c = tf.keras.backend.eval(b) +// # c = 4 +// # here a[4] = 166.32 which is the largest element of a across axis 0 +// ``` // // Arguments: -// ref: A resource handle. Must be from a VarHandleOp. -// indices: A Tensor. Must be one of the following types: int32, int64. -// A tensor of indices into ref. -// updates: A Tensor. Must have the same type as ref. A tensor of -// values to add to ref. // -// Returns the created operation. -func ResourceScatterNdSub(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdSubAttr) (o *tf.Operation) { +// dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`. +// Describes which dimension of the input Tensor to reduce across. For vectors, +// use dimension = 0. +func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -18158,95 +33538,1425 @@ func ResourceScatterNdSub(scope *Scope, ref tf.Output, indices tf.Output, update a(attrs) } opspec := tf.OpSpec{ - Type: "ResourceScatterNdSub", + Type: "ArgMax", Input: []tf.Input{ - ref, indices, updates, + input, dimension, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the mean along segments of a tensor. +// +// Read +// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) +// for an explanation of segments. +// +// Computes a tensor such that +// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is +// over `j` such that `segment_ids[j] == i` and `N` is the total number of +// values summed. +// +// If the mean is empty for a given segment ID `i`, `output[i] = 0`. 
+// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt> +// </div> +// +// For example: +// +// ``` +// c = tf.constant([[1.0,2,3,4], [4, 3, 2, 1], [5,6,7,8]]) +// tf.segment_mean(c, tf.constant([0, 0, 1])) +// # ==> [[2.5, 2.5, 2.5, 2.5], +// # [5, 6, 7, 8]] +// ``` +// +// +// Arguments: +// +// segment_ids: A 1-D tensor whose size is equal to the size of `data`'s +// first dimension. Values should be sorted and can be repeated. +// +// Returns Has same shape as data, except for dimension 0 which +// has size `k`, the number of segments. +func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SegmentMean", + Input: []tf.Input{ + data, segment_ids, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the product along segments of a tensor. +// +// Read +// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) +// for an explanation of segments. +// +// Computes a tensor such that +// \\(output_i = \prod_j data_j\\) where the product is over `j` such +// that `segment_ids[j] == i`. +// +// If the product is empty for a given segment ID `i`, `output[i] = 1`. +// +// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> +// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt> +// </div> +// +// For example: +// +// ``` +// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]]) +// tf.segment_prod(c, tf.constant([0, 0, 1])) +// # ==> [[4, 6, 6, 4], +// # [5, 6, 7, 8]] +// ``` +// +// +// Arguments: +// +// segment_ids: A 1-D tensor whose size is equal to the size of `data`'s +// first dimension. Values should be sorted and can be repeated. +// +// Returns Has same shape as data, except for dimension 0 which +// has size `k`, the number of segments. +func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SegmentProd", + Input: []tf.Input{ + data, segment_ids, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// PackAttr is an optional argument to Pack. +type PackAttr func(optionalAttr) + +// PackAxis sets the optional axis attribute to value. +// +// value: Dimension along which to pack. Negative values wrap around, so the +// valid range is `[-(R+1), R+1)`. +// If not specified, defaults to 0 +func PackAxis(value int64) PackAttr { + return func(m optionalAttr) { + m["axis"] = value + } +} + +// Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor. +// +// Packs the `N` tensors in `values` into a tensor with rank one higher than each +// tensor in `values`, by packing them along the `axis` dimension. +// Given a list of tensors of shape `(A, B, C)`; +// +// if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`. +// if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`. +// Etc. +// +// For example: +// +// ``` +// # 'x' is [1, 4] +// # 'y' is [2, 5] +// # 'z' is [3, 6] +// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]] # Pack along first dim. +// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]] +// ``` +// +// This is the opposite of `unpack`. +// +// Arguments: +// values: Must be of same shape and type. +// +// Returns The packed tensor. 
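+//
+// A short, hypothetical sketch of stacking three scalars along axis 0
+// (assumes an existing Scope `s`; the names below are illustrative only):
+//
+//	x := op.Const(s.SubScope("x"), int32(1))
+//	y := op.Const(s.SubScope("y"), int32(2))
+//	z := op.Const(s.SubScope("z"), int32(3))
+//	stacked := op.Pack(s, []tf.Output{x, y, z}, op.PackAxis(0))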
+func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Pack", + Input: []tf.Input{ + tf.OutputList(values), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the minimum along segments of a tensor. +// +// Read +// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) +// for an explanation of segments. +// +// This operator is similar to the unsorted segment sum operator found +// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum). +// Instead of computing the sum over segments, it computes the minimum such that: +// +// \\(output_i = \min_{j...} data_[j...]\\) where min is over tuples `j...` such +// that `segment_ids[j...] == i`. +// +// If the minimum is empty for a given segment ID `i`, it outputs the largest +// possible value for the specific numeric type, +// `output[i] = numeric_limits<T>::max()`. +// +// For example: +// +// ``` python +// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]]) +// tf.unsorted_segment_min(c, tf.constant([0, 1, 0]), num_segments=2) +// # ==> [[ 1, 2, 2, 1], +// # [5, 6, 7, 8]] +// ``` +// +// If the given segment ID `i` is negative, then the corresponding value is +// dropped, and will not be included in the result. +// +// Arguments: +// +// segment_ids: A tensor whose shape is a prefix of `data.shape`. +// +// +// Returns Has same shape as data, except for the first `segment_ids.rank` +// dimensions, which are replaced with a single dimension which has size +// `num_segments`. +func UnsortedSegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "UnsortedSegmentMin", + Input: []tf.Input{ + data, segment_ids, num_segments, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad. +type ResizeBicubicGradAttr func(optionalAttr) + +// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value. +// +// value: If true, the centers of the 4 corner pixels of the input and grad tensors are +// aligned. Defaults to false. +// If not specified, defaults to false +func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr { + return func(m optionalAttr) { + m["align_corners"] = value + } +} + +// ResizeBicubicGradHalfPixelCenters sets the optional half_pixel_centers attribute to value. +// If not specified, defaults to false +func ResizeBicubicGradHalfPixelCenters(value bool) ResizeBicubicGradAttr { + return func(m optionalAttr) { + m["half_pixel_centers"] = value + } +} + +// Computes the gradient of bicubic interpolation. +// +// Arguments: +// grads: 4-D with shape `[batch, height, width, channels]`. +// original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`, +// The image tensor that was resized. +// +// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. +// Gradients with respect to the input image. Input image must have been +// float or double. 
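+//
+// This op is normally inserted by TensorFlow's gradient machinery rather than
+// called directly; as a rough sketch, a manual call (with hypothetical 4-D
+// float Outputs `grads` and `original` and an existing Scope `s`) would be:
+//
+//	out := op.ResizeBicubicGrad(s, grads, original, op.ResizeBicubicGradAlignCorners(true))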
+func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResizeBicubicGrad", + Input: []tf.Input{ + grads, original_image, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the product along segments of a tensor. +// +// Read +// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) +// for an explanation of segments. +// +// This operator is similar to the unsorted segment sum operator found +// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum). +// Instead of computing the sum over segments, it computes the product of all +// entries belonging to a segment such that: +// +// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples +// `j...` such that `segment_ids[j...] == i`. +// +// For example: +// +// ``` python +// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]]) +// tf.unsorted_segment_prod(c, tf.constant([0, 1, 0]), num_segments=2) +// # ==> [[ 4, 6, 6, 4], +// # [5, 6, 7, 8]] +// ``` +// +// If there is no entry for a given segment ID `i`, it outputs 1. +// +// If the given segment ID `i` is negative, then the corresponding value is +// dropped, and will not be included in the result. +// +// Arguments: +// +// segment_ids: A tensor whose shape is a prefix of `data.shape`. +// +// +// Returns Has same shape as data, except for the first `segment_ids.rank` +// dimensions, which are replaced with a single dimension which has size +// `num_segments`. +func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "UnsortedSegmentProd", + Input: []tf.Input{ + data, segment_ids, num_segments, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a dataset that passes a sliding window over `input_dataset`. +// +// Arguments: +// +// window_size: A scalar representing the number of elements in the +// sliding window. +// window_shift: A scalar representing the steps moving the sliding window +// forward in one iteration. It must be positive. +// window_stride: A scalar representing the stride of the input elements of the sliding window. +// It must be positive. +// +// +func ExperimentalSlidingWindowDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "ExperimentalSlidingWindowDataset", + Input: []tf.Input{ + input_dataset, window_size, window_shift, window_stride, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes sigmoid of `x` element-wise. +// +// Specifically, `y = 1 / (1 + exp(-x))`. +func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Sigmoid", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the sum along sparse segments of a tensor. 
+// +// Read +// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) +// for an explanation of segments. +// +// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first +// dimension, selecting a subset of dimension 0, specified by `indices`. +// +// For example: +// +// ```python +// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]]) +// +// # Select two rows, one segment. +// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0])) +// # => [[0 0 0 0]] +// +// # Select two rows, two segment. +// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1])) +// # => [[ 1 2 3 4] +// # [-1 -2 -3 -4]] +// +// # Select all rows, two segments. +// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1])) +// # => [[0 0 0 0] +// # [5 6 7 8]] +// +// # Which is equivalent to: +// tf.segment_sum(c, tf.constant([0, 0, 1])) +// ``` +// +// Arguments: +// +// indices: A 1-D tensor. Has same rank as `segment_ids`. +// segment_ids: A 1-D tensor. Values should be sorted and can be repeated. +// +// Returns Has same shape as data, except for dimension 0 which +// has size `k`, the number of segments. +func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseSegmentSum", + Input: []tf.Input{ + data, indices, segment_ids, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the mean along sparse segments of a tensor. +// +// See `tf.sparse.segment_sum` for usage examples. +// +// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first +// dimension, selecting a subset of dimension 0, specified by `indices`. +// +// Arguments: +// +// indices: A 1-D tensor. Has same rank as `segment_ids`. +// segment_ids: A 1-D tensor. Values should be sorted and can be repeated. +// +// Returns Has same shape as data, except for dimension 0 which +// has size `k`, the number of segments. +func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseSegmentMean", + Input: []tf.Input{ + data, indices, segment_ids, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the mean along sparse segments of a tensor. +// +// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is +// misisng, the `output` tensor at that position will be zeroed. +// +// Read +// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) +// for an explanation of segments. +// +// Arguments: +// +// indices: A 1-D tensor. Has same rank as `segment_ids`. +// segment_ids: A 1-D tensor. Values should be sorted and can be repeated. +// num_segments: Should equal the number of distinct segment IDs. +// +// Returns Has same shape as data, except for dimension 0 which has size +// `num_segments`. +func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseSegmentMeanWithNumSegments", + Input: []tf.Input{ + data, indices, segment_ids, num_segments, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints. 
+type MergeV2CheckpointsAttr func(optionalAttr) + +// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value. +// +// value: see above. +// If not specified, defaults to true +func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr { + return func(m optionalAttr) { + m["delete_old_dirs"] = value + } +} + +// V2 format specific: merges the metadata files of sharded checkpoints. The +// +// result is one logical checkpoint, with one physical metadata file and renamed +// data files. +// +// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup. +// +// If delete_old_dirs is true, attempts to delete recursively the dirname of each +// path in the input checkpoint_prefixes. This is useful when those paths are non +// user-facing temporary locations. +// +// Arguments: +// checkpoint_prefixes: prefixes of V2 checkpoints to merge. +// destination_prefix: scalar. The desired final prefix. Allowed to be the same +// as one of the checkpoint_prefixes. +// +// Returns the created operation. +func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MergeV2Checkpoints", + Input: []tf.Input{ + checkpoint_prefixes, destination_prefix, }, Attrs: attrs, } return scope.AddOperation(opspec) } -// The gradient of SparseFillEmptyRows. +// MeanAttr is an optional argument to Mean. +type MeanAttr func(optionalAttr) + +// MeanKeepDims sets the optional keep_dims attribute to value. // -// Takes vectors reverse_index_map, shaped `[N]`, and grad_values, -// shaped `[N_full]`, where `N_full >= N` and copies data into either -// `d_values` or `d_default_value`. Here `d_values` is shaped `[N]` and -// `d_default_value` is a scalar. +// value: If true, retain reduced dimensions with length 1. +// If not specified, defaults to false +func MeanKeepDims(value bool) MeanAttr { + return func(m optionalAttr) { + m["keep_dims"] = value + } +} + +// Computes the mean of elements across dimensions of a tensor. // -// d_values[j] = grad_values[reverse_index_map[j]] -// d_default_value = sum_{k : 0 .. N_full - 1} ( -// grad_values[k] * 1{k not in reverse_index_map}) +// Reduces `input` along the dimensions given in `axis`. Unless +// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +// `axis`. If `keep_dims` is true, the reduced dimensions are +// retained with length 1. // // Arguments: -// reverse_index_map: 1-D. The reverse index map from SparseFillEmptyRows. -// grad_values: 1-D. The gradients from backprop. +// input: The tensor to reduce. +// axis: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // -// Returns 1-D. The backprop into values.0-D. The backprop into default_value. -func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) { +// Returns The reduced tensor. 
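+//
+// Illustrative sketch only (assumes a Scope `s` and a rank-2 float32 Output
+// `x` created elsewhere with this package): average over the last dimension.
+//
+//	axis := op.Const(s.SubScope("axis"), []int32{-1})
+//	avg := op.Mean(s, x, axis)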
+func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Mean", + Input: []tf.Input{ + input, axis, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the sum along sparse segments of a tensor divided by the sqrt of N. +// +// N is the size of the segment being reduced. +// +// See `tf.sparse.segment_sum` for usage examples. +// +// +// Arguments: +// +// indices: A 1-D tensor. Has same rank as `segment_ids`. +// segment_ids: A 1-D tensor. Values should be sorted and can be repeated. +// +// Returns Has same shape as data, except for dimension 0 which +// has size `k`, the number of segments. +func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "SparseFillEmptyRowsGrad", + Type: "SparseSegmentSqrtN", Input: []tf.Input{ - reverse_index_map, grad_values, + data, indices, segment_ids, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Removes keys and its associated values from a table. +// +// The tensor `keys` must of the same type as the keys of the table. Keys not +// already in the table are silently ignored. +// +// Arguments: +// table_handle: Handle to the table. +// keys: Any shape. Keys of the elements to remove. +// +// Returns the created operation. +func LookupTableRemoveV2(scope *Scope, table_handle tf.Output, keys tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "LookupTableRemoveV2", + Input: []tf.Input{ + table_handle, keys, + }, + } + return scope.AddOperation(opspec) +} + +// ResourceGatherAttr is an optional argument to ResourceGather. +type ResourceGatherAttr func(optionalAttr) + +// ResourceGatherBatchDims sets the optional batch_dims attribute to value. +// If not specified, defaults to 0 +func ResourceGatherBatchDims(value int64) ResourceGatherAttr { + return func(m optionalAttr) { + m["batch_dims"] = value + } +} + +// ResourceGatherValidateIndices sets the optional validate_indices attribute to value. +// If not specified, defaults to true +func ResourceGatherValidateIndices(value bool) ResourceGatherAttr { + return func(m optionalAttr) { + m["validate_indices"] = value + } +} + +// Gather slices from the variable pointed to by `resource` according to `indices`. +// +// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D). +// Produces an output tensor with shape `indices.shape + params.shape[1:]` where: +// +// ```python +// # Scalar indices +// output[:, ..., :] = params[indices, :, ... :] +// +// # Vector indices +// output[i, :, ..., :] = params[indices[i], :, ... :] +// +// # Higher rank indices +// output[i, ..., j, :, ... 
:] = params[indices[i, ..., j], :, ..., :] +// ``` +func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceGather", + Input: []tf.Input{ + resource, indices, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the sum along sparse segments of a tensor divided by the sqrt of N. +// +// N is the size of the segment being reduced. +// +// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is +// misisng, the `output` tensor at that position will be zeroed. +// +// Read +// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) +// for an explanation of segments. +// +// Arguments: +// +// indices: A 1-D tensor. Has same rank as `segment_ids`. +// segment_ids: A 1-D tensor. Values should be sorted and can be repeated. +// num_segments: Should equal the number of distinct segment IDs. +// +// Returns Has same shape as data, except for dimension 0 which +// has size `k`, the number of segments. +func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseSegmentSqrtNWithNumSegments", + Input: []tf.Input{ + data, indices, segment_ids, num_segments, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AnyAttr is an optional argument to Any. +type AnyAttr func(optionalAttr) + +// AnyKeepDims sets the optional keep_dims attribute to value. +// +// value: If true, retain reduced dimensions with length 1. +// If not specified, defaults to false +func AnyKeepDims(value bool) AnyAttr { + return func(m optionalAttr) { + m["keep_dims"] = value + } +} + +// Computes the "logical or" of elements across dimensions of a tensor. +// +// Reduces `input` along the dimensions given in `axis`. Unless +// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +// `axis`. If `keep_dims` is true, the reduced dimensions are +// retained with length 1. +// +// Arguments: +// input: The tensor to reduce. +// axis: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. +// +// Returns The reduced tensor. +func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Any", + Input: []tf.Input{ + input, axis, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes gradients for SparseSegmentSqrtN. +// +// Returns tensor "output" with same shape as grad, except for dimension 0 whose +// value is output_dim0. +// +// Arguments: +// grad: gradient propagated to the SparseSegmentSqrtN op. +// indices: indices passed to the corresponding SparseSegmentSqrtN op. +// segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op. +// output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op. 
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseSegmentSqrtNGrad", + Input: []tf.Input{ + grad, indices, segment_ids, output_dim0, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AllAttr is an optional argument to All. +type AllAttr func(optionalAttr) + +// AllKeepDims sets the optional keep_dims attribute to value. +// +// value: If true, retain reduced dimensions with length 1. +// If not specified, defaults to false +func AllKeepDims(value bool) AllAttr { + return func(m optionalAttr) { + m["keep_dims"] = value + } +} + +// Computes the "logical and" of elements across dimensions of a tensor. +// +// Reduces `input` along the dimensions given in `axis`. Unless +// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +// `axis`. If `keep_dims` is true, the reduced dimensions are +// retained with length 1. +// +// Arguments: +// input: The tensor to reduce. +// axis: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. +// +// Returns The reduced tensor. +func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "All", + Input: []tf.Input{ + input, axis, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MapClearAttr is an optional argument to MapClear. +type MapClearAttr func(optionalAttr) + +// MapClearCapacity sets the optional capacity attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func MapClearCapacity(value int64) MapClearAttr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// MapClearMemoryLimit sets the optional memory_limit attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func MapClearMemoryLimit(value int64) MapClearAttr { + return func(m optionalAttr) { + m["memory_limit"] = value + } +} + +// MapClearContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func MapClearContainer(value string) MapClearAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// MapClearSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func MapClearSharedName(value string) MapClearAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Op removes all elements in the underlying container. +// +// Returns the created operation. +func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtypes": dtypes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MapClear", + + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Creates a sequence of numbers. +// +// This operation creates a sequence of numbers that begins at `start` and +// extends by increments of `delta` up to but not including `limit`. +// +// For example: +// +// ``` +// # 'start' is 3 +// # 'limit' is 18 +// # 'delta' is 3 +// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15] +// ``` +// +// Arguments: +// start: 0-D (scalar). First entry in the sequence. +// limit: 0-D (scalar). 
Upper limit of sequence, exclusive. +// delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`. +// +// Returns 1-D. +func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Range", + Input: []tf.Input{ + start, limit, delta, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ComplexAttr is an optional argument to Complex. +type ComplexAttr func(optionalAttr) + +// ComplexTout sets the optional Tout attribute to value. +// If not specified, defaults to DT_COMPLEX64 +func ComplexTout(value tf.DataType) ComplexAttr { + return func(m optionalAttr) { + m["Tout"] = value + } +} + +// Converts two real numbers to a complex number. +// +// Given a tensor `real` representing the real part of a complex number, and a +// tensor `imag` representing the imaginary part of a complex number, this +// operation returns complex numbers elementwise of the form \\(a + bj\\), where +// *a* represents the `real` part and *b* represents the `imag` part. +// +// The input tensors `real` and `imag` must have the same shape. +// +// For example: +// +// ``` +// # tensor 'real' is [2.25, 3.25] +// # tensor `imag` is [4.75, 5.75] +// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]] +// ``` +func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Complex", + Input: []tf.Input{ + real, imag, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RandomShuffleAttr is an optional argument to RandomShuffle. +type RandomShuffleAttr func(optionalAttr) + +// RandomShuffleSeed sets the optional seed attribute to value. +// +// value: If either `seed` or `seed2` are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func RandomShuffleSeed(value int64) RandomShuffleAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// RandomShuffleSeed2 sets the optional seed2 attribute to value. +// +// value: A second seed to avoid seed collision. +// If not specified, defaults to 0 +func RandomShuffleSeed2(value int64) RandomShuffleAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Randomly shuffles a tensor along its first dimension. +// +// The tensor is shuffled along dimension 0, such that each `value[j]` is mapped +// to one and only one `output[i]`. For example, a mapping that might occur for a +// 3x2 tensor is: +// +// ``` +// [[1, 2], [[5, 6], +// [3, 4], ==> [1, 2], +// [5, 6]] [3, 4]] +// ``` +// +// Arguments: +// value: The tensor to be shuffled. +// +// Returns A tensor of same shape and type as `value`, shuffled along its first +// dimension. +func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RandomShuffle", + Input: []tf.Input{ + value, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RealAttr is an optional argument to Real. +type RealAttr func(optionalAttr) + +// RealTout sets the optional Tout attribute to value. 
+// If not specified, defaults to DT_FLOAT +func RealTout(value tf.DataType) RealAttr { + return func(m optionalAttr) { + m["Tout"] = value + } +} + +// Returns the real part of a complex number. +// +// Given a tensor `input` of complex numbers, this operation returns a tensor of +// type `float` that is the real part of each element in `input`. All elements in +// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real +// part returned by this operation and *b* is the imaginary part. +// +// For example: +// +// ``` +// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j] +// tf.real(input) ==> [-2.25, 3.25] +// ``` +func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Real", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ImagAttr is an optional argument to Imag. +type ImagAttr func(optionalAttr) + +// ImagTout sets the optional Tout attribute to value. +// If not specified, defaults to DT_FLOAT +func ImagTout(value tf.DataType) ImagAttr { + return func(m optionalAttr) { + m["Tout"] = value + } +} + +// Returns the imaginary part of a complex number. +// +// Given a tensor `input` of complex numbers, this operation returns a tensor of +// type `float` that is the imaginary part of each element in `input`. All +// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a* +// is the real part and *b* is the imaginary part returned by this operation. +// +// For example: +// +// ``` +// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j] +// tf.imag(input) ==> [4.75, 5.75] +// ``` +func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Imag", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// BoostedTreesCreateQuantileStreamResourceAttr is an optional argument to BoostedTreesCreateQuantileStreamResource. +type BoostedTreesCreateQuantileStreamResourceAttr func(optionalAttr) + +// BoostedTreesCreateQuantileStreamResourceMaxElements sets the optional max_elements attribute to value. +// +// value: int; The maximum number of data points that can be fed to the stream. +// If not specified, defaults to 1099511627776 +func BoostedTreesCreateQuantileStreamResourceMaxElements(value int64) BoostedTreesCreateQuantileStreamResourceAttr { + return func(m optionalAttr) { + m["max_elements"] = value + } +} + +// Create the Resource for Quantile Streams. +// +// Arguments: +// quantile_stream_resource_handle: resource; Handle to quantile stream resource. +// epsilon: float; The required approximation error of the stream resource. +// num_streams: int; The number of streams managed by the resource that shares the same epsilon. +// +// Returns the created operation. 
+func BoostedTreesCreateQuantileStreamResource(scope *Scope, quantile_stream_resource_handle tf.Output, epsilon tf.Output, num_streams tf.Output, optional ...BoostedTreesCreateQuantileStreamResourceAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "BoostedTreesCreateQuantileStreamResource", + Input: []tf.Input{ + quantile_stream_resource_handle, epsilon, num_streams, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth. +type HistogramFixedWidthAttr func(optionalAttr) + +// HistogramFixedWidthDtype sets the optional dtype attribute to value. +// If not specified, defaults to DT_INT32 +func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr { + return func(m optionalAttr) { + m["dtype"] = value + } +} + +// Return histogram of values. +// +// Given the tensor `values`, this operation returns a rank 1 histogram counting +// the number of entries in `values` that fall into every bin. The bins are +// equal width and determined by the arguments `value_range` and `nbins`. +// +// ```python +// # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) +// nbins = 5 +// value_range = [0.0, 5.0] +// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] +// +// with tf.get_default_session() as sess: +// hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) +// variables.global_variables_initializer().run() +// sess.run(hist) => [2, 1, 1, 0, 2] +// ``` +// +// Arguments: +// values: Numeric `Tensor`. +// value_range: Shape [2] `Tensor` of same `dtype` as `values`. +// values <= value_range[0] will be mapped to hist[0], +// values >= value_range[1] will be mapped to hist[-1]. +// nbins: Scalar `int32 Tensor`. Number of histogram bins. +// +// Returns A 1-D `Tensor` holding histogram of values. +func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "HistogramFixedWidth", + Input: []tf.Input{ + values, value_range, nbins, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Scatters tensor at indices in an input list. +// +// Each member of the TensorList corresponds to one row of the input tensor, +// specified by the given index (see `tf.gather`). +// +// input_handle: The list to scatter into. +// tensor: The input tensor. +// indices: The indices used to index into the list. +// output_handle: The TensorList. +func TensorListScatterIntoExistingList(scope *Scope, input_handle tf.Output, tensor tf.Output, indices tf.Output) (output_handle tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorListScatterIntoExistingList", + Input: []tf.Input{ + input_handle, tensor, indices, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Worker heartbeat op. +// +// Heartbeats may be sent periodically to indicate the coordinator is still active, +// to retrieve the current worker status and to expedite shutdown when necessary. 
+// +// Arguments: +// request: A string tensor containing a serialized WorkerHeartbeatRequest +// +// Returns A string tensor containing a serialized WorkerHeartbeatResponse +func WorkerHeartbeat(scope *Scope, request tf.Output) (response tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "WorkerHeartbeat", + Input: []tf.Input{ + request, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes square of x element-wise. +// +// I.e., \\(y = x * x = x^2\\). +func Square(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Square", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// CumprodAttr is an optional argument to Cumprod. +type CumprodAttr func(optionalAttr) + +// CumprodExclusive sets the optional exclusive attribute to value. +// +// value: If `True`, perform exclusive cumprod. +// If not specified, defaults to false +func CumprodExclusive(value bool) CumprodAttr { + return func(m optionalAttr) { + m["exclusive"] = value + } +} + +// CumprodReverse sets the optional reverse attribute to value. +// +// value: A `bool` (default: False). +// If not specified, defaults to false +func CumprodReverse(value bool) CumprodAttr { + return func(m optionalAttr) { + m["reverse"] = value + } +} + +// Compute the cumulative product of the tensor `x` along `axis`. +// +// By default, this op performs an inclusive cumprod, which means that the first +// element of the input is identical to the first element of the output: +// +// ```python +// tf.cumprod([a, b, c]) # => [a, a * b, a * b * c] +// ``` +// +// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is +// performed instead: +// +// ```python +// tf.cumprod([a, b, c], exclusive=True) # => [1, a, a * b] +// ``` +// +// By setting the `reverse` kwarg to `True`, the cumprod is performed in the +// opposite direction: +// +// ```python +// tf.cumprod([a, b, c], reverse=True) # => [a * b * c, b * c, c] +// ``` +// +// This is more efficient than using separate `tf.reverse` ops. +// +// The `reverse` and `exclusive` kwargs can also be combined: +// +// ```python +// tf.cumprod([a, b, c], exclusive=True, reverse=True) # => [b * c, c, 1] +// ``` +// +// Arguments: +// x: A `Tensor`. Must be one of the following types: `float32`, `float64`, +// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, +// `complex128`, `qint8`, `quint8`, `qint32`, `half`. +// axis: A `Tensor` of type `int32` (default: 0). Must be in the range +// `[-rank(x), rank(x))`. +func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Cumprod", + Input: []tf.Input{ + x, axis, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes a range that covers the actual values present in a quantized tensor. +// +// Given a quantized tensor described by `(input, input_min, input_max)`, outputs a +// range that covers the actual values present in that tensor. This op is typically +// used to produce the `requested_output_min` and `requested_output_max` for +// `Requantize`. +// +// Arguments: +// +// input_min: The float value that the minimum quantized input value represents. +// input_max: The float value that the maximum quantized input value represents. 
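`Cumprod` shows the functional-options pattern used for optional attributes throughout this file. A graph-construction-only sketch (illustrative; it can be executed with the usual `tf.NewSession`/`Session.Run` steps from the earlier examples):

```go
package main

import (
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	x := op.Const(s, []float32{2, 3, 4})
	axis := op.Const(s, int32(0))

	// Inclusive cumprod: [2 6 24] once the graph is run.
	_ = op.Cumprod(s, x, axis)
	// Exclusive, reversed cumprod: [12 4 1].
	_ = op.Cumprod(s, x, axis, op.CumprodExclusive(true), op.CumprodReverse(true))

	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
}
```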
+// +// Returns The computed min output.the computed max output. +func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "RequantizationRange", + Input: []tf.Input{ + input, input_min, input_max, }, } op := scope.AddOperation(opspec) return op.Output(0), op.Output(1) } -// Strip leading and trailing whitespaces from the Tensor. +// Clips tensor values to a specified min and max. +// +// Given a tensor `t`, this operation returns a tensor of the same type and +// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`. +// Any values less than `clip_value_min` are set to `clip_value_min`. Any values +// greater than `clip_value_max` are set to `clip_value_max`. // // Arguments: -// input: A string `Tensor` of any shape. +// t: A `Tensor`. +// clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape +// as `t`. The minimum value to clip by. +// clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape +// as `t`. The maximum value to clip by. // -// Returns A string `Tensor` of the same shape as the input. -func StringStrip(scope *Scope, input tf.Output) (output tf.Output) { +// Returns A clipped `Tensor` with the same shape as input 't'. +func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "StringStrip", + Type: "ClipByValue", Input: []tf.Input{ - input, + t, clip_value_min, clip_value_max, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad. -type ResourceApplyProximalAdagradAttr func(optionalAttr) +// RequantizePerChannelAttr is an optional argument to RequantizePerChannel. +type RequantizePerChannelAttr func(optionalAttr) -// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value. +// RequantizePerChannelOutType sets the optional out_type attribute to value. // -// value: If True, updating of the var and accum tensors will be protected by -// a lock; otherwise the behavior is undefined, but may exhibit less contention. -// If not specified, defaults to false -func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr { +// value: The quantized type of output tensor that needs to be converted. +// If not specified, defaults to DT_QUINT8 +func RequantizePerChannelOutType(value tf.DataType) RequantizePerChannelAttr { return func(m optionalAttr) { - m["use_locking"] = value + m["out_type"] = value } } -// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. -// -// accum += grad * grad -// prox_v = var - lr * grad * (1 / sqrt(accum)) -// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} +// Requantizes input with min and max values known per channel. // // Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// lr: Scaling factor. Must be a scalar. -// l1: L1 regularization. Must be a scalar. -// l2: L2 regularization. Must be a scalar. -// grad: The gradient. +// input: The original input tensor. +// input_min: The minimum value of the input tensor +// input_max: The maximum value of the input tensor. +// requested_output_min: The minimum value of the output tensor requested. 
+// requested_output_max: The maximum value of the output tensor requested. // -// Returns the created operation. -func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) { +// Returns Output tensor.The minimum value of the final output tensorThe maximum value of the final output tensor. +func RequantizePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, optional ...RequantizePerChannelAttr) (output tf.Output, output_min tf.Output, output_max tf.Output) { if scope.Err() != nil { return } @@ -18255,35 +34965,1599 @@ func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, a(attrs) } opspec := tf.OpSpec{ - Type: "ResourceApplyProximalAdagrad", + Type: "RequantizePerChannel", Input: []tf.Input{ - var_, accum, lr, l1, l2, grad, + input, input_min, input_max, requested_output_min, requested_output_max, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// ReduceJoinAttr is an optional argument to ReduceJoin. +type ReduceJoinAttr func(optionalAttr) + +// ReduceJoinKeepDims sets the optional keep_dims attribute to value. +// +// value: If `True`, retain reduced dimensions with length `1`. +// If not specified, defaults to false +func ReduceJoinKeepDims(value bool) ReduceJoinAttr { + return func(m optionalAttr) { + m["keep_dims"] = value + } +} + +// ReduceJoinSeparator sets the optional separator attribute to value. +// +// value: The separator to use when joining. +// If not specified, defaults to "" +func ReduceJoinSeparator(value string) ReduceJoinAttr { + return func(m optionalAttr) { + m["separator"] = value + } +} + +// Joins a string Tensor across the given dimensions. +// +// Computes the string join across dimensions in the given string Tensor of shape +// `[\\(d_0, d_1, ..., d_{n-1}\\)]`. Returns a new Tensor created by joining the input +// strings with the given separator (default: empty string). Negative indices are +// counted backwards from the end, with `-1` being equivalent to `n - 1`. If +// indices are not specified, joins across all dimensions beginning from `n - 1` +// through `0`. +// +// For example: +// +// ```python +// # tensor `a` is [["a", "b"], ["c", "d"]] +// tf.reduce_join(a, 0) ==> ["ac", "bd"] +// tf.reduce_join(a, 1) ==> ["ab", "cd"] +// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"] +// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"] +// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]] +// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]] +// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"] +// tf.reduce_join(a, [0, 1]) ==> "acbd" +// tf.reduce_join(a, [1, 0]) ==> "abcd" +// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]] +// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd" +// ``` +// +// Arguments: +// inputs: The input to be joined. All reduced indices must have non-zero size. +// reduction_indices: The dimensions to reduce over. Dimensions are reduced in the +// order specified. Omitting `reduction_indices` is equivalent to passing +// `[n-1, n-2, ..., 0]`. Negative indices from `-n` to `-1` are supported. +// +// Returns Has shape equal to that of the input with reduced dimensions removed or +// set to `1` depending on `keep_dims`. 
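`ClipByValue` takes its bounds as tensors, so scalar constants can be passed directly. A short illustrative sketch under the same assumptions as the earlier examples:

```go
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	t := op.Const(s, []float32{-1.0, 0.25, 0.75, 3.0})
	clipped := op.ClipByValue(s, t, op.Const(s, float32(0)), op.Const(s, float32(1)))

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	out, err := sess.Run(nil, []tf.Output{clipped}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value()) // [0 0.25 0.75 1]
}
```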
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ReduceJoin", + Input: []tf.Input{ + inputs, reduction_indices, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes requantization range per channel. +// +// Arguments: +// input: The original input tensor. +// input_min: The minimum value of the input tensor +// input_max: The maximum value of the input tensor. +// clip_value_max: The maximum value of the output that needs to be clipped. +// Example: set this to 6 for Relu6. +// +// Returns The minimum value of the final output tensorThe maximum value of the final output tensor. +func RequantizationRangePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, clip_value_max float32) (output_min tf.Output, output_max tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"clip_value_max": clip_value_max} + opspec := tf.OpSpec{ + Type: "RequantizationRangePerChannel", + Input: []tf.Input{ + input, input_min, input_max, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// Replaces the contents of the table with the specified keys and values. +// +// The tensor `keys` must be of the same type as the keys of the table. +// The tensor `values` must be of the type of the table values. +// +// Arguments: +// table_handle: Handle to the table. +// keys: Any shape. Keys to look up. +// values: Values to associate with keys. +// +// Returns the created operation. +func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "LookupTableImportV2", + Input: []tf.Input{ + table_handle, keys, values, + }, + } + return scope.AddOperation(opspec) +} + +// Computes rectified linear: `max(features, 0)`. +func Relu(scope *Scope, features tf.Output) (activations tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Relu", + Input: []tf.Input{ + features, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the next representable value of `x1` in the direction of `x2`, element-wise. +// +// This operation returns the same result as the C++ std::nextafter function. +// +// It can also return a subnormal number. +// +// @compatibility(cpp) +// Equivalent to C++ std::nextafter function. +// @end_compatibility +func NextAfter(scope *Scope, x1 tf.Output, x2 tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "NextAfter", + Input: []tf.Input{ + x1, x2, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Fetches multiple values from infeed as an XLA tuple. +// +// Arguments: +// dtypes: The element types of each element in `outputs`. +// shapes: The shapes of each tensor in `outputs`. +// +// Returns A list of tensors that will be provided using the infeed mechanism. 
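A graph-only sketch of `ReduceJoin` with the `separator` attribute, mirroring the `tf.reduce_join(a, 0, separator=".")` line from the comment above (illustrative; not part of the generated file):

```go
package main

import (
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	a := op.Const(s, [][]string{{"a", "b"}, {"c", "d"}})
	axis := op.Const(s, []int32{0})

	// Joins down the first dimension: ["a.c" "b.d"] once the graph is run.
	_ = op.ReduceJoin(s, a, axis, op.ReduceJoinSeparator("."))

	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
}
```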
+func InfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape) (outputs []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes} + opspec := tf.OpSpec{ + Type: "InfeedDequeueTuple", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil { + scope.UpdateErr("InfeedDequeueTuple", err) + return + } + return outputs +} + +// Rolls the elements of a tensor along an axis. +// +// The elements are shifted positively (towards larger indices) by the offset of +// `shift` along the dimension of `axis`. Negative `shift` values will shift +// elements in the opposite direction. Elements that roll passed the last position +// will wrap around to the first and vice versa. Multiple shifts along multiple +// axes may be specified. +// +// For example: +// +// ``` +// # 't' is [0, 1, 2, 3, 4] +// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2] +// +// # shifting along multiple dimensions +// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] +// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]] +// +// # shifting along the same axis multiple times +// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] +// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]] +// ``` +// +// Arguments: +// +// shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which +// elements are shifted positively (towards larger indices) along the dimension +// specified by `axis[i]`. Negative shifts will roll the elements in the opposite +// direction. +// axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift +// `shift[i]` should occur. If the same axis is referenced more than once, the +// total shift for that axis will be the sum of all the shifts that belong to that +// axis. +// +// Returns Has the same shape and size as the input. The elements are shifted +// positively (towards larger indices) by the offsets of `shift` along the +// dimensions of `axis`. +func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Roll", + Input: []tf.Input{ + input, shift, axis, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// StringSplitV2Attr is an optional argument to StringSplitV2. +type StringSplitV2Attr func(optionalAttr) + +// StringSplitV2Maxsplit sets the optional maxsplit attribute to value. +// +// value: An `int`. If `maxsplit > 0`, limit of the split of the result. +// If not specified, defaults to -1 +func StringSplitV2Maxsplit(value int64) StringSplitV2Attr { + return func(m optionalAttr) { + m["maxsplit"] = value + } +} + +// Split elements of `source` based on `sep` into a `SparseTensor`. +// +// Let N be the size of source (typically N will be the batch size). Split each +// element of `source` based on `sep` and return a `SparseTensor` +// containing the split tokens. Empty tokens are ignored. +// +// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c', +// then the output will be +// ``` +// st.indices = [0, 0; +// 0, 1; +// 1, 0; +// 1, 1; +// 1, 2] +// st.shape = [2, 3] +// st.values = ['hello', 'world', 'a', 'b', 'c'] +// ``` +// +// If `sep` is given, consecutive delimiters are not grouped together and are +// deemed to delimit empty strings. 
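`Roll`'s first documented example, expressed with these wrappers (graph construction only, illustrative):

```go
package main

import (
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	t := op.Const(s, []int32{0, 1, 2, 3, 4})

	// Shift by 2 along axis 0: yields [3 4 0 1 2] when the graph is run.
	_ = op.Roll(s, t, op.Const(s, int32(2)), op.Const(s, int32(0)))

	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
}
```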
For example, source of `"1<>2<><>3"` and +// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty +// string, consecutive whitespace are regarded as a single separator, and the +// result will contain no empty strings at the startor end if the string has +// leading or trailing whitespace. +// +// Note that the above mentioned behavior matches python's str.split. +// +// Arguments: +// input: `1-D` string `Tensor`, the strings to split. +// sep: `0-D` string `Tensor`, the delimiter character. +func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StringSplitV2", + Input: []tf.Input{ + input, sep, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Concatenates a list of `N` tensors along the first dimension. +// +// The input tensors are all required to have size 1 in the first dimension. +// +// For example: +// +// ``` +// # 'x' is [[1, 4]] +// # 'y' is [[2, 5]] +// # 'z' is [[3, 6]] +// parallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]] # Pack along first dim. +// ``` +// +// The difference between concat and parallel_concat is that concat requires all +// of the inputs be computed before the operation will begin but doesn't require +// that the input shapes be known during graph construction. Parallel concat +// will copy pieces of the input into the output as they become available, in +// some situations this can provide a performance benefit. +// +// Arguments: +// values: Tensors to be concatenated. All must have size 1 in the first dimension +// and same shape. +// shape: the final shape of the result; should be equal to the shapes of any input +// but with the number of input values in the first dimension. +// +// Returns The concatenated tensor. +func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"shape": shape} + opspec := tf.OpSpec{ + Type: "ParallelConcat", + Input: []tf.Input{ + tf.OutputList(values), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Adjust the contrast of one or more images. +// +// `images` is a tensor of at least 3 dimensions. The last 3 dimensions are +// interpreted as `[height, width, channels]`. The other dimensions only +// represent a collection of images, such as `[batch, height, width, channels].` +// +// Contrast is adjusted independently for each channel of each image. +// +// For each channel, the Op first computes the mean of the image pixels in the +// channel and then adjusts each component of each pixel to +// `(x - mean) * contrast_factor + mean`. +// +// Arguments: +// images: Images to adjust. At least 3-D. +// contrast_factor: A float multiplier for adjusting contrast. +// +// Returns The contrast-adjusted image or images. +func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "AdjustContrastv2", + Input: []tf.Input{ + images, contrast_factor, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Converts each string in the input Tensor to its hash mod by a number of buckets. 
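`StringSplitV2` returns the three components of a `SparseTensor` as separate outputs, and all three can be fetched in one `Session.Run` call. An illustrative sketch using the whitespace-splitting behaviour described above:

```go
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	source := op.Const(s, []string{"hello world", "a b c"})
	sep := op.Const(s, "") // empty separator: split on runs of whitespace
	indices, values, shape := op.StringSplitV2(s, source, sep)

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	out, err := sess.Run(nil, []tf.Output{indices, values, shape}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value()) // [[0 0] [0 1] [1 0] [1 1] [1 2]]
	fmt.Println(out[1].Value()) // [hello world a b c]
	fmt.Println(out[2].Value()) // [2 3]
}
```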
+// +// The hash function is deterministic on the content of the string within the +// process. +// +// Note that the hash function may change from time to time. +// This functionality will be deprecated and it's recommended to use +// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`. +// +// Arguments: +// +// num_buckets: The number of buckets. +// +// Returns A Tensor of the same shape as the input `string_tensor`. +func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_buckets": num_buckets} + opspec := tf.OpSpec{ + Type: "StringToHashBucket", + Input: []tf.Input{ + string_tensor, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MutableHashTableV2Attr is an optional argument to MutableHashTableV2. +type MutableHashTableV2Attr func(optionalAttr) + +// MutableHashTableV2Container sets the optional container attribute to value. +// +// value: If non-empty, this table is placed in the given container. +// Otherwise, a default container is used. +// If not specified, defaults to "" +func MutableHashTableV2Container(value string) MutableHashTableV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// MutableHashTableV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this table is shared under the given name across +// multiple sessions. +// If not specified, defaults to "" +func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value. +// +// value: If true and shared_name is empty, the table is shared +// using the node name. +// If not specified, defaults to false +func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr { + return func(m optionalAttr) { + m["use_node_name_sharing"] = value + } +} + +// Creates an empty hash table. +// +// This op creates a mutable hash table, specifying the type of its keys and +// values. Each value must be a scalar. Data can be inserted into the table using +// the insert operations. It does not support the initialization operation. +// +// Arguments: +// key_dtype: Type of the table keys. +// value_dtype: Type of the table values. +// +// Returns Handle to a table. +func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MutableHashTableV2", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2. +type MutableDenseHashTableV2Attr func(optionalAttr) + +// MutableDenseHashTableV2Container sets the optional container attribute to value. +// +// value: If non-empty, this table is placed in the given container. +// Otherwise, a default container is used. 
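Stateful lookup-table ops combine a handle-producing wrapper (`MutableHashTableV2`) with operations such as `LookupTableImportV2` and `LookupTableExportV2` defined elsewhere in this file. A rough sketch (illustrative; the table name and the two-step `Run` ordering are my own choices):

```go
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	table := op.MutableHashTableV2(s, tf.Int64, tf.String,
		op.MutableHashTableV2SharedName("example_table"))
	importOp := op.LookupTableImportV2(s, table,
		op.Const(s, []int64{1, 2}), op.Const(s, []string{"one", "two"}))
	keys, vals := op.LookupTableExportV2(s, table, tf.Int64, tf.String)

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	// Populate the table first, then export its contents in a second step.
	if _, err := sess.Run(nil, nil, []*tf.Operation{importOp}); err != nil {
		panic(err)
	}
	out, err := sess.Run(nil, []tf.Output{keys, vals}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value(), out[1].Value())
}
```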
+// If not specified, defaults to "" +func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this table is shared under the given name across +// multiple sessions. +// If not specified, defaults to "" +func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value. +// If not specified, defaults to false +func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr { + return func(m optionalAttr) { + m["use_node_name_sharing"] = value + } +} + +// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value. +// +// value: The shape of each value. +// If not specified, defaults to <> +func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr { + return func(m optionalAttr) { + m["value_shape"] = value + } +} + +// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value. +// +// value: The initial number of hash table buckets. Must be a power +// to 2. +// If not specified, defaults to 131072 +func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr { + return func(m optionalAttr) { + m["initial_num_buckets"] = value + } +} + +// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value. +// +// value: The maximum ratio between number of entries and number of +// buckets before growing the table. Must be between 0 and 1. +// If not specified, defaults to 0.8 +func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr { + return func(m optionalAttr) { + m["max_load_factor"] = value + } +} + +// Creates an empty hash table that uses tensors as the backing store. +// +// It uses "open addressing" with quadratic reprobing to resolve +// collisions. +// +// This op creates a mutable hash table, specifying the type of its keys and +// values. Each value must be a scalar. Data can be inserted into the table using +// the insert operations. It does not support the initialization operation. +// +// Arguments: +// empty_key: The key used to represent empty key buckets internally. Must not +// be used in insert or lookup operations. +// +// value_dtype: Type of the table values. +// +// Returns Handle to a table. +func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"value_dtype": value_dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MutableDenseHashTableV2", + Input: []tf.Input{ + empty_key, deleted_key, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MultinomialAttr is an optional argument to Multinomial. +type MultinomialAttr func(optionalAttr) + +// MultinomialSeed sets the optional seed attribute to value. +// +// value: If either seed or seed2 is set to be non-zero, the internal random number +// generator is seeded by the given seed. Otherwise, a random seed is used. 
+// If not specified, defaults to 0 +func MultinomialSeed(value int64) MultinomialAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// MultinomialSeed2 sets the optional seed2 attribute to value. +// +// value: A second seed to avoid seed collision. +// If not specified, defaults to 0 +func MultinomialSeed2(value int64) MultinomialAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// MultinomialOutputDtype sets the optional output_dtype attribute to value. +// If not specified, defaults to DT_INT64 +func MultinomialOutputDtype(value tf.DataType) MultinomialAttr { + return func(m optionalAttr) { + m["output_dtype"] = value + } +} + +// Draws samples from a multinomial distribution. +// +// Arguments: +// logits: 2-D Tensor with shape `[batch_size, num_classes]`. Each slice `[i, :]` +// represents the unnormalized log probabilities for all classes. +// num_samples: 0-D. Number of independent samples to draw for each row slice. +// +// Returns 2-D Tensor with shape `[batch_size, num_samples]`. Each slice `[i, :]` +// contains the drawn class labels with range `[0, num_classes)`. +func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Multinomial", + Input: []tf.Input{ + logits, num_samples, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Table initializer that takes two tensors for keys and values respectively. +// +// Arguments: +// table_handle: Handle to a table which will be initialized. +// keys: Keys of type Tkey. +// values: Values of type Tval. +// +// Returns the created operation. +func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "InitializeTableV2", + Input: []tf.Input{ + table_handle, keys, values, + }, + } + return scope.AddOperation(opspec) +} + +// PrefetchDatasetAttr is an optional argument to PrefetchDataset. +type PrefetchDatasetAttr func(optionalAttr) + +// PrefetchDatasetSlackPeriod sets the optional slack_period attribute to value. +// If not specified, defaults to 0 +func PrefetchDatasetSlackPeriod(value int64) PrefetchDatasetAttr { + return func(m optionalAttr) { + m["slack_period"] = value + } +} + +// Creates a dataset that asynchronously prefetches elements from `input_dataset`. +// +// Arguments: +// +// buffer_size: The maximum number of elements to buffer in an iterator over +// this dataset. +// +// +func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...PrefetchDatasetAttr) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "PrefetchDataset", + Input: []tf.Input{ + input_dataset, buffer_size, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Elementwise computes the bitwise OR of `x` and `y`. +// +// The result will have those bits set, that are set in `x`, `y` or both. The +// computation is performed on the underlying representations of `x` and `y`. 
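A graph-only sketch of `Multinomial` with a fixed seed attribute (illustrative):

```go
package main

import (
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// Unnormalized log-probabilities for a single batch row with 4 classes.
	logits := op.Const(s, [][]float32{{0, 0, 1, 1}})
	numSamples := op.Const(s, int32(5))

	// Fix the seed so repeated runs draw the same class labels.
	_ = op.Multinomial(s, logits, numSamples, op.MultinomialSeed(42))

	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
}
```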
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BitwiseOr", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2. +type InitializeTableFromTextFileV2Attr func(optionalAttr) + +// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value. +// +// value: Number of elements of the file, use -1 if unknown. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr { + return func(m optionalAttr) { + m["vocab_size"] = value + } +} + +// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value. +// +// value: Delimiter to separate fields in a line. +// If not specified, defaults to "\t" +func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr { + return func(m optionalAttr) { + m["delimiter"] = value + } +} + +// Initializes a table from a text file. +// +// It inserts one key-value pair into the table for each line of the file. +// The key and value is extracted from the whole line content, elements from the +// split line based on `delimiter` or the line number (starting from zero). +// Where to extract the key and value from a line is specified by `key_index` and +// `value_index`. +// +// - A value of -1 means use the line number(starting from zero), expects `int64`. +// - A value of -2 means use the whole line content, expects `string`. +// - A value >= 0 means use the index (starting at zero) of the split line based +// on `delimiter`. +// +// Arguments: +// table_handle: Handle to a table which will be initialized. +// filename: Filename of a vocabulary text file. +// key_index: Column index in a line to get the table `key` values from. +// value_index: Column index that represents information of a line to get the table +// `value` values from. +// +// Returns the created operation. +func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "InitializeTableFromTextFileV2", + Input: []tf.Input{ + table_handle, filename, }, Attrs: attrs, } return scope.AddOperation(opspec) } -// Makes the summary of quantiles for the batch. +// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage. +type CropAndResizeGradImageAttr func(optionalAttr) + +// CropAndResizeGradImageMethod sets the optional method attribute to value. // -// An op that takes a list of tensors (one tensor per feature) and outputs the -// quantile summaries for each tensor. +// value: A string specifying the interpolation method. Only 'bilinear' is +// supported for now. +// If not specified, defaults to "bilinear" +func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr { + return func(m optionalAttr) { + m["method"] = value + } +} + +// Computes the gradient of the crop_and_resize op wrt the input image tensor. // // Arguments: -// float_values: float; List of Rank 1 Tensors each containing values for a single feature. 
-// example_weights: float; Rank 1 Tensor with weights per instance. -// epsilon: float; The required maximum approximation error. +// grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`. +// boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor +// specifies the coordinates of a box in the `box_ind[i]` image and is specified +// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of +// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the +// `[0, 1]` interval of normalized image height is mapped to +// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in +// which case the sampled crop is an up-down flipped version of the original +// image. The width dimension is treated similarly. Normalized coordinates +// outside the `[0, 1]` range are allowed, in which case we use +// `extrapolation_value` to extrapolate the input image values. +// box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`. +// The value of `box_ind[i]` specifies the image that the `i`-th box refers to. +// image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]` +// containing the original image size. Both `image_height` and `image_width` need +// to be positive. // -// Returns float; List of Rank 2 Tensors each containing the quantile summary -// (value, weight, min_rank, max_rank) of a single feature. -func BoostedTreesMakeQuantileSummaries(scope *Scope, float_values []tf.Output, example_weights tf.Output, epsilon tf.Output) (summaries []tf.Output) { +// +// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`. +func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"T": T} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "CropAndResizeGradImage", + Input: []tf.Input{ + grads, boxes, box_ind, image_size, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// PrintV2Attr is an optional argument to PrintV2. +type PrintV2Attr func(optionalAttr) + +// PrintV2OutputStream sets the optional output_stream attribute to value. +// +// value: A string specifying the output stream or logging level to print to. +// If not specified, defaults to "stderr" +func PrintV2OutputStream(value string) PrintV2Attr { + return func(m optionalAttr) { + m["output_stream"] = value + } +} + +// PrintV2End sets the optional end attribute to value. +// If not specified, defaults to "\n" +func PrintV2End(value string) PrintV2Attr { + return func(m optionalAttr) { + m["end"] = value + } +} + +// Prints a string scalar. +// +// Prints a string scalar to the desired output_stream. +// +// Arguments: +// input: The string scalar to print. +// +// Returns the created operation. +func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "PrintV2", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Outputs a `Summary` protocol buffer with a tensor and per-plugin data. +// +// Arguments: +// tag: A string attached to this summary. Used for organization in TensorBoard. 
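Wrappers documented as "Returns the created operation", such as `PrintV2`, hand back a `*tf.Operation` that is passed to `Session.Run` as a target rather than fetched as an output. An illustrative sketch:

```go
package main

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	msg := op.Const(s, "hello from the graph")
	printOp := op.PrintV2(s, msg, op.PrintV2OutputStream("stdout"))

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	// Ops that return *tf.Operation are run as targets; there is nothing to fetch.
	if _, err := sess.Run(nil, nil, []*tf.Operation{printOp}); err != nil {
		panic(err)
	}
}
```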
+// tensor: A tensor to serialize. +// serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin +// data. +func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "BoostedTreesMakeQuantileSummaries", + Type: "TensorSummaryV2", Input: []tf.Input{ - tf.OutputList(float_values), example_weights, epsilon, + tag, tensor, serialized_summary_metadata, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AudioSummaryV2Attr is an optional argument to AudioSummaryV2. +type AudioSummaryV2Attr func(optionalAttr) + +// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value. +// +// value: Max number of batch elements to generate audio for. +// If not specified, defaults to 3 +// +// REQUIRES: value >= 1 +func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr { + return func(m optionalAttr) { + m["max_outputs"] = value + } +} + +// Outputs a `Summary` protocol buffer with audio. +// +// The summary has up to `max_outputs` summary values containing audio. The +// audio is built from `tensor` which must be 3-D with shape `[batch_size, +// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are +// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`. +// +// The `tag` argument is a scalar `Tensor` of type `string`. It is used to +// build the `tag` of the summary values: +// +// * If `max_outputs` is 1, the summary value tag is '*tag*/audio'. +// * If `max_outputs` is greater than 1, the summary value tags are +// generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc. +// +// Arguments: +// tag: Scalar. Used to build the `tag` attribute of the summary values. +// tensor: 2-D of shape `[batch_size, frames]`. +// sample_rate: The sample rate of the signal in hertz. +// +// Returns Scalar. Serialized `Summary` protocol buffer. +func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "AudioSummaryV2", + Input: []tf.Input{ + tag, tensor, sample_rate, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Merges summaries. +// +// This op creates a +// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) +// protocol buffer that contains the union of all the values in the input +// summaries. +// +// When the Op is run, it reports an `InvalidArgument` error if multiple values +// in the summaries to merge use the same tag. +// +// Arguments: +// inputs: Can be of any shape. Each must contain serialized `Summary` protocol +// buffers. +// +// Returns Scalar. Serialized `Summary` protocol buffer. +func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "MergeSummary", + Input: []tf.Input{ + tf.OutputList(inputs), + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates and returns an empty tensor list. +// +// All list elements must be tensors of dtype element_dtype and shape compatible +// with element_shape. +// +// handle: an empty tensor list. +// element_dtype: the type of elements in the list. 
+// element_shape: a shape compatible with that of elements in the list. +func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"element_dtype": element_dtype} + opspec := tf.OpSpec{ + Type: "EmptyTensorList", + Input: []tf.Input{ + element_shape, max_num_elements, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the number of tensors in the input tensor list. +// +// input_handle: the input list +// length: the number of tensors in the list +func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorListLength", + Input: []tf.Input{ + input_handle, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the last element of the input list as well as a list with all but that element. +// +// Fails if the list is empty. +// +// input_handle: the input list +// tensor: the withdrawn last element of the list +// element_dtype: the type of elements in the list +// element_shape: the shape of the output tensor +func TensorListPopBack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"element_dtype": element_dtype} + opspec := tf.OpSpec{ + Type: "TensorListPopBack", + Input: []tf.Input{ + input_handle, element_shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// NonDeterministicIntsAttr is an optional argument to NonDeterministicInts. +type NonDeterministicIntsAttr func(optionalAttr) + +// NonDeterministicIntsDtype sets the optional dtype attribute to value. +// +// value: The type of the output. +// If not specified, defaults to DT_INT64 +func NonDeterministicIntsDtype(value tf.DataType) NonDeterministicIntsAttr { + return func(m optionalAttr) { + m["dtype"] = value + } +} + +// Non-deterministically generates some integers. +// +// This op may use some OS-provided source of non-determinism (e.g. an RNG), so each execution will give different results. +// +// Arguments: +// shape: The shape of the output tensor. +// +// Returns Non-deterministic integer values with specified shape. +func NonDeterministicInts(scope *Scope, shape tf.Output, optional ...NonDeterministicIntsAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "NonDeterministicInts", + Input: []tf.Input{ + shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// TensorListConcatAttr is an optional argument to TensorListConcat. +type TensorListConcatAttr func(optionalAttr) + +// TensorListConcatElementShape sets the optional element_shape attribute to value. +// If not specified, defaults to <unknown_rank:true > +func TensorListConcatElementShape(value tf.Shape) TensorListConcatAttr { + return func(m optionalAttr) { + m["element_shape"] = value + } +} + +// Concats all tensors in the list along the 0th dimension. +// +// Requires that all tensors have the same shape except the first dimension. +// +// input_handle: The input list. +// tensor: The concated result. 
+// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient. +// +func TensorListConcat(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListConcatAttr) (tensor tf.Output, lengths tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"element_dtype": element_dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TensorListConcat", + Input: []tf.Input{ + input_handle, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// An Op to permute tensors across replicated TPU instances. +// +// Each instance supplies its own input. +// +// For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing +// source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs: +// `[D, A, B, C]`. +// +// Arguments: +// input: The local input to be permuted. Currently only supports float and +// bfloat16. +// source_target_pairs: A tensor with shape [num_pairs, 2]. +// +// Returns The permuted input. +func CollectivePermute(scope *Scope, input tf.Output, source_target_pairs tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "CollectivePermute", + Input: []tf.Input{ + input, source_target_pairs, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Splits a tensor into a list. +// +// list[i] corresponds to lengths[i] tensors from the input tensor. +// The tensor must have rank at least 1 and contain exactly sum(lengths) elements. +// +// tensor: The input tensor. +// element_shape: A shape compatible with that of elements in the tensor. +// lengths: Vector of sizes of the 0th dimension of tensors in the list. +// output_handle: The list. +func TensorListSplit(scope *Scope, tensor tf.Output, element_shape tf.Output, lengths tf.Output) (output_handle tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorListSplit", + Input: []tf.Input{ + tensor, element_shape, lengths, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a TensorList which, when stacked, has the value of `tensor`. +// +// Each tensor in the result list corresponds to one row of the input tensor. +// +// tensor: The input tensor. +// output_handle: The list. +func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorListFromTensor", + Input: []tf.Input{ + tensor, element_shape, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Generates values in an interval. +// +// A sequence of `num` evenly-spaced values are generated beginning at `start`. +// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`, +// so that the last one is exactly `stop`. +// +// For example: +// +// ``` +// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0 11.0 12.0] +// ``` +// +// Arguments: +// start: 0-D tensor. First entry in the range. +// stop: 0-D tensor. Last entry in the range. +// num: 0-D tensor. Number of values to generate. +// +// Returns 1-D. The generated values. 
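The `TensorList*` wrappers operate on a variant-typed list handle. A small illustrative sketch that builds a list from a tensor, queries its length, and pops the last element back off as a dense tensor (it assumes the same session boilerplate as the earlier examples):

```go
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	tensor := op.Const(s, [][]float32{{1, 2}, {3, 4}, {5, 6}})
	elementShape := op.Const(s, []int32{2})

	list := op.TensorListFromTensor(s, tensor, elementShape)
	length := op.TensorListLength(s, list)
	// TensorListPopBack returns the shortened list and the withdrawn element.
	_, last := op.TensorListPopBack(s, list, elementShape, tf.Float)

	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()

	out, err := sess.Run(nil, []tf.Output{length, last}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value()) // 3
	fmt.Println(out[1].Value()) // [5 6]
}
```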
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "LinSpace", + Input: []tf.Input{ + start, stop, num, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// List of the given size with empty elements. +// +// element_shape: the shape of the future elements of the list +// num_elements: the number of elements to reserve +// handle: the output list +// element_dtype: the desired type of elements in the list. +func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"element_dtype": element_dtype} + opspec := tf.OpSpec{ + Type: "TensorListReserve", + Input: []tf.Input{ + element_shape, num_elements, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the item in the list with the given index. +// +// input_handle: the list +// index: the position in the list from which an element will be retrieved +// item: the element at that position +// +// +func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_shape tf.Output, element_dtype tf.DataType) (item tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"element_dtype": element_dtype} + opspec := tf.OpSpec{ + Type: "TensorListGetItem", + Input: []tf.Input{ + input_handle, index, element_shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the value stored in an Optional variant or raises an error if none exists. +func OptionalGetValue(scope *Scope, optional tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "OptionalGetValue", + Input: []tf.Input{ + optional, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if components, idx, err = makeOutputList(op, idx, "components"); err != nil { + scope.UpdateErr("OptionalGetValue", err) + return + } + return components +} + +// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter. +type Conv3DBackpropFilterAttr func(optionalAttr) + +// Conv3DBackpropFilterDilations sets the optional dilations attribute to value. +// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 > +func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes the gradients of 3-D convolution with respect to the filter. +// +// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2 +// +// Arguments: +// input: Shape `[batch, depth, rows, cols, in_channels]`. +// filter: Shape `[depth, rows, cols, in_channels, out_channels]`. +// `in_channels` must match between `input` and `filter`. +// out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols, +// out_channels]`. +// strides: 1-D tensor of length 5. The stride of the sliding window for each +// dimension of `input`. Must have `strides[0] = strides[4] = 1`. +// padding: The type of padding algorithm to use. 
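`LinSpace`'s documented example in Go (graph construction only, illustrative):

```go
package main

import (
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// Mirrors the example above: tf.linspace(10.0, 12.0, 3) => [10 11 12].
	_ = op.LinSpace(s,
		op.Const(s, float32(10)), op.Const(s, float32(12)), op.Const(s, int32(3)))

	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
}
```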
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Conv3DBackpropFilter", + Input: []tf.Input{ + input, filter, out_backprop, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Sets the index-th position of the list to contain the given tensor. +// +// input_handle: the list +// index: the position in the list to which the tensor will be assigned +// item: the element to be assigned to that position +// output_handle: the new list, with the element in the proper position +// +func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorListSetItem", + Input: []tf.Input{ + input_handle, index, item, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a Tensor by indexing into the TensorList. +// +// Each row in the produced Tensor corresponds to the element in the TensorList +// specified by the given index (see `tf.gather`). +// +// input_handle: The input tensor list. +// indices: The indices used to index into the list. +// values: The tensor. +func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_shape tf.Output, element_dtype tf.DataType) (values tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"element_dtype": element_dtype} + opspec := tf.OpSpec{ + Type: "TensorListGather", + Input: []tf.Input{ + input_handle, indices, element_shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a TensorList by indexing into a Tensor. +// +// Each member of the TensorList corresponds to one row of the input tensor, +// specified by the given index (see `tf.gather`). +// +// tensor: The input tensor. +// indices: The indices used to index into the list. +// element_shape: The shape of the elements in the list (can be less specified than +// the shape of the tensor). +// output_handle: The TensorList. +func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorListScatter", + Input: []tf.Input{ + tensor, indices, element_shape, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns which elements of x are NaN. +// +// @compatibility(numpy) +// Equivalent to np.isnan +// @end_compatibility +func IsNan(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "IsNan", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a TensorList by indexing into a Tensor. +// +// Each member of the TensorList corresponds to one row of the input tensor, +// specified by the given index (see `tf.gather`). +// +// tensor: The input tensor. +// indices: The indices used to index into the list. +// element_shape: The shape of the elements in the list (can be less specified than +// the shape of the tensor). +// num_elements: The size of the output list. 
Must be large enough to accommodate +// the largest index in indices. If -1, the list is just large enough to include +// the largest index in indices. +// output_handle: The TensorList. +func TensorListScatterV2(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output, num_elements tf.Output) (output_handle tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TensorListScatterV2", + Input: []tf.Input{ + tensor, indices, element_shape, num_elements, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the sign and the log of the absolute value of the determinant of +// +// one or more square matrices. +// +// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions +// form square matrices. The outputs are two tensors containing the signs and +// absolute values of the log determinants for all N input submatrices +// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant). +// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU +// is the LU decomposition of the input and P is the corresponding +// permutation matrix. +// +// Arguments: +// input: Shape is `[N, M, M]`. +// +// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants +// of the N input matrices. Shape is `[N]`. +func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "LogMatrixDeterminant", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// Outputs all keys and values in the table. +// +// Arguments: +// table_handle: Handle to the table. +// +// +// +// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`. +func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues} + opspec := tf.OpSpec{ + Type: "LookupTableExportV2", + Input: []tf.Input{ + table_handle, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// MatrixInverseAttr is an optional argument to MatrixInverse. +type MatrixInverseAttr func(optionalAttr) + +// MatrixInverseAdjoint sets the optional adjoint attribute to value. +// If not specified, defaults to false +func MatrixInverseAdjoint(value bool) MatrixInverseAttr { + return func(m optionalAttr) { + m["adjoint"] = value + } +} + +// Computes the inverse of one or more square invertible matrices or their +// +// adjoints (conjugate transposes). +// +// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +// form square matrices. The output is a tensor of the same shape as the input +// containing the inverse for all input submatrices `[..., :, :]`. +// +// The op uses LU decomposition with partial pivoting to compute the inverses. +// +// If a matrix is not invertible there is no guarantee what the op does. It +// may detect the condition and raise an exception or it may simply return a +// garbage result. +// +// Arguments: +// input: Shape is `[..., M, M]`. +// +// Returns Shape is `[..., M, M]`. 
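A graph-only sketch of `LogMatrixDeterminant` on a single 2x2 matrix (illustrative; the expected values are noted in the comments):

```go
package main

import (
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// One 2x2 matrix with determinant 6, so sign = 1 and log|det| = ln(6).
	m := op.Const(s, [][][]float32{{{2, 0}, {0, 3}}})
	sign, logAbsDet := op.LogMatrixDeterminant(s, m)
	_, _ = sign, logAbsDet // build-only sketch; fetch these with Session.Run as usual

	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
}
```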
+// +// @compatibility(numpy) +// Equivalent to np.linalg.inv +// @end_compatibility +func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "MatrixInverse", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Bucketize each feature based on bucket boundaries. +// +// An op that returns a list of float tensors, where each tensor represents the +// bucketized values for a single feature. +// +// Arguments: +// float_values: float; List of Rank 1 Tensor each containing float values for a single feature. +// bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a single +// feature. +// +// Returns int; List of Rank 1 Tensors each containing the bucketized values for a single feature. +func BoostedTreesBucketize(scope *Scope, float_values []tf.Output, bucket_boundaries []tf.Output) (buckets []tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BoostedTreesBucketize", + Input: []tf.Input{ + tf.OutputList(float_values), tf.OutputList(bucket_boundaries), }, } op := scope.AddOperation(opspec) @@ -18292,57 +36566,154 @@ func BoostedTreesMakeQuantileSummaries(scope *Scope, float_values []tf.Output, e } var idx int var err error - if summaries, idx, err = makeOutputList(op, idx, "summaries"); err != nil { - scope.UpdateErr("BoostedTreesMakeQuantileSummaries", err) + if buckets, idx, err = makeOutputList(op, idx, "buckets"); err != nil { + scope.UpdateErr("BoostedTreesBucketize", err) return } - return summaries + return buckets } -// StringSplitAttr is an optional argument to StringSplit. -type StringSplitAttr func(optionalAttr) +// Computes hyperbolic cosine of x element-wise. +func Cosh(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Cosh", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} -// StringSplitSkipEmpty sets the optional skip_empty attribute to value. +// Set a summary_writer_interface to record statistics using given stats_aggregator. // -// value: A `bool`. If `True`, skip the empty strings from the result. -// If not specified, defaults to true -func StringSplitSkipEmpty(value bool) StringSplitAttr { +// Returns the created operation. +func StatsAggregatorSetSummaryWriter(scope *Scope, stats_aggregator tf.Output, summary tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "StatsAggregatorSetSummaryWriter", + Input: []tf.Input{ + stats_aggregator, summary, + }, + } + return scope.AddOperation(opspec) +} + +// Deprecated, use python implementation tf.linalg.matrix_exponential. +// +// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead. +func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "MatrixExponential", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the Cholesky decomposition of one or more square matrices. +// +// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +// form square matrices. +// +// The input has to be symmetric and positive definite. 
Only the lower-triangular +// part of the input will be used for this operation. The upper-triangular part +// will not be read. +// +// The output is a tensor of the same shape as the input +// containing the Cholesky decompositions for all input submatrices `[..., :, :]`. +// +// **Note**: The gradient computation on GPU is faster for large matrices but +// not for large batch dimensions when the submatrices are small. In this +// case it might be faster to use the CPU. +// +// Arguments: +// input: Shape is `[..., M, M]`. +// +// Returns Shape is `[..., M, M]`. +func Cholesky(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Cholesky", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes the Eigen Decomposition of a batch of square self-adjoint matrices. +// +// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead. +// +// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +// form square matrices, with the same constraints as the single matrix +// SelfAdjointEig. +// +// The result is a [..., M+1, M] matrix with [..., 0,:] containing the +// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues +// are sorted in non-decreasing order. +// +// Arguments: +// input: Shape is `[..., M, M]`. +// +// Returns Shape is `[..., M+1, M]`. +func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SelfAdjointEig", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MatrixSolveAttr is an optional argument to MatrixSolve. +type MatrixSolveAttr func(optionalAttr) + +// MatrixSolveAdjoint sets the optional adjoint attribute to value. +// +// value: Boolean indicating whether to solve with `matrix` or its (block-wise) +// adjoint. +// If not specified, defaults to false +func MatrixSolveAdjoint(value bool) MatrixSolveAttr { return func(m optionalAttr) { - m["skip_empty"] = value + m["adjoint"] = value } } -// Split elements of `input` based on `delimiter` into a `SparseTensor`. +// Solves systems of linear equations. // -// Let N be the size of source (typically N will be the batch size). Split each -// element of `input` based on `delimiter` and return a `SparseTensor` -// containing the splitted tokens. Empty tokens are ignored. -// -// `delimiter` can be empty, or a string of split characters. If `delimiter` is an -// empty string, each element of `input` is split into individual single-byte -// character strings, including splitting of UTF-8 multibyte sequences. Otherwise -// every character of `delimiter` is a potential split point. -// -// For example: -// N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output -// will be -// -// indices = [0, 0; -// 0, 1; -// 1, 0; -// 1, 1; -// 1, 2] -// shape = [2, 3] -// values = ['hello', 'world', 'a', 'b', 'c'] +// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is +// a tensor shape `[..., M, K]`. If `adjoint` is `False` then each output matrix +// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`. +// If `adjoint` is `True` then each output matrix satisfies +// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`. // // Arguments: -// input: 1-D. Strings to split. 
-// delimiter: 0-D. Delimiter characters (bytes), or empty string. +// matrix: Shape is `[..., M, M]`. +// rhs: Shape is `[..., M, K]`. // -// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse -// tensor, where the first value is N and the second value is the maximum number -// of tokens in a single input entry. -func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) { +// Returns Shape is `[..., M, K]`. +func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -18351,9 +36722,318 @@ func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional .. a(attrs) } opspec := tf.OpSpec{ - Type: "StringSplit", + Type: "MatrixSolve", Input: []tf.Input{ - input, delimiter, + matrix, rhs, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Enqueue a Tensor on the computation outfeed. +// +// Arguments: +// input: A tensor that will be inserted into the outfeed queue. +// +// Returns the created operation. +func OutfeedEnqueue(scope *Scope, input tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "OutfeedEnqueue", + Input: []tf.Input{ + input, + }, + } + return scope.AddOperation(opspec) +} + +// Computes the gradient of the sigmoid of `x` wrt its input. +// +// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and +// `dy` is the corresponding input gradient. +func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SigmoidGrad", + Input: []tf.Input{ + y, dy, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// CumsumAttr is an optional argument to Cumsum. +type CumsumAttr func(optionalAttr) + +// CumsumExclusive sets the optional exclusive attribute to value. +// +// value: If `True`, perform exclusive cumsum. +// If not specified, defaults to false +func CumsumExclusive(value bool) CumsumAttr { + return func(m optionalAttr) { + m["exclusive"] = value + } +} + +// CumsumReverse sets the optional reverse attribute to value. +// +// value: A `bool` (default: False). +// If not specified, defaults to false +func CumsumReverse(value bool) CumsumAttr { + return func(m optionalAttr) { + m["reverse"] = value + } +} + +// Compute the cumulative sum of the tensor `x` along `axis`. +// +// By default, this op performs an inclusive cumsum, which means that the first +// element of the input is identical to the first element of the output: +// +// ```python +// tf.cumsum([a, b, c]) # => [a, a + b, a + b + c] +// ``` +// +// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is +// performed instead: +// +// ```python +// tf.cumsum([a, b, c], exclusive=True) # => [0, a, a + b] +// ``` +// +// By setting the `reverse` kwarg to `True`, the cumsum is performed in the +// opposite direction: +// +// ```python +// tf.cumsum([a, b, c], reverse=True) # => [a + b + c, b + c, c] +// ``` +// +// This is more efficient than using separate `tf.reverse` ops. 
+// +// The `reverse` and `exclusive` kwargs can also be combined: +// +// ```python +// tf.cumsum([a, b, c], exclusive=True, reverse=True) # => [b + c, c, 0] +// ``` +// +// Arguments: +// x: A `Tensor`. Must be one of the following types: `float32`, `float64`, +// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, +// `complex128`, `qint8`, `quint8`, `qint32`, `half`. +// axis: A `Tensor` of type `int32` (default: 0). Must be in the range +// `[-rank(x), rank(x))`. +func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Cumsum", + Input: []tf.Input{ + x, axis, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// QrAttr is an optional argument to Qr. +type QrAttr func(optionalAttr) + +// QrFullMatrices sets the optional full_matrices attribute to value. +// +// value: If true, compute full-sized `q` and `r`. If false +// (the default), compute only the leading `P` columns of `q`. +// If not specified, defaults to false +func QrFullMatrices(value bool) QrAttr { + return func(m optionalAttr) { + m["full_matrices"] = value + } +} + +// Computes the QR decompositions of one or more matrices. +// +// Computes the QR decomposition of each inner matrix in `tensor` such that +// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])` +// +// ```python +// # a is a tensor. +// # q is a tensor of orthonormal matrices. +// # r is a tensor of upper triangular matrices. +// q, r = qr(a) +// q_full, r_full = qr(a, full_matrices=True) +// ``` +// +// Arguments: +// input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions +// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`. +// +// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then +// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is +// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is +// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`. +func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Qr", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter. +type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr) + +// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value. +// +// value: Specify the data format of the input and output data. With the +// default format "NHWC", the data is stored in the order of: +// [batch, height, width, channels]. +// Alternatively, the format could be "NCHW", the data storage order of: +// [batch, channels, height, width]. +// If not specified, defaults to "NHWC" +func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value. +// +// value: 1-D tensor of length 4. The dilation factor for each dimension of +// `input`. 
If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+ return func(m optionalAttr) {
+ m["dilations"] = value
+ }
+}
+
+// Computes the gradients of depthwise convolution with respect to the filter.
+//
+// Arguments:
+// input: 4-D with shape based on `data_format`. For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+// filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+// out_backprop: 4-D with shape based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+// strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+// padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`. Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ attrs := map[string]interface{}{"strides": strides, "padding": padding}
+ for _, a := range optional {
+ a(attrs)
+ }
+ opspec := tf.OpSpec{
+ Type: "DepthwiseConv2dNativeBackpropFilter",
+ Input: []tf.Input{
+ input, filter_sizes, out_backprop,
+ },
+ Attrs: attrs,
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0)
+}
+
+// SvdAttr is an optional argument to Svd.
+type SvdAttr func(optionalAttr)
+
+// SvdComputeUv sets the optional compute_uv attribute to value.
+//
+// value: If true, left and right singular vectors will be
+// computed and returned in `u` and `v`, respectively.
+// If false, `u` and `v` are not set and should never be referenced.
+// If not specified, defaults to true
+func SvdComputeUv(value bool) SvdAttr {
+ return func(m optionalAttr) {
+ m["compute_uv"] = value
+ }
+}
+
+// SvdFullMatrices sets the optional full_matrices attribute to value.
+//
+// value: If true, compute full-sized `u` and `v`. If false
+// (the default), compute only the leading `P` singular vectors.
+// Ignored if `compute_uv` is `False`.
+// If not specified, defaults to false
+func SvdFullMatrices(value bool) SvdAttr {
+ return func(m optionalAttr) {
+ m["full_matrices"] = value
+ }
+}
+
+// Computes the singular value decompositions of one or more matrices.
+//
+// Computes the SVD of each inner matrix in `input` such that
+// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+//
+// ```python
+// # a is a tensor containing a batch of matrices.
+// # s is a tensor of singular values for each matrix.
+// # u is the tensor containing the left singular vectors for each matrix.
+// # v is the tensor containing the right singular vectors for each matrix.
+// s, u, v = svd(a)
+// s, _, _ = svd(a, compute_uv=False)
+// ```
+//
+// Arguments:
+// input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//
+// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Undefined if `compute_uv` is `False`.Right singular vectors. If `full_matrices` is `False` then shape is
+// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+// Undefined if `compute_uv` is false.
+func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ attrs := map[string]interface{}{}
+ for _, a := range optional {
+ a(attrs)
+ }
+ opspec := tf.OpSpec{
+ Type: "Svd",
+ Input: []tf.Input{
+ input,
 },
 Attrs: attrs,
 }
@@ -18361,6 +37041,326 @@ func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ..
 return op.Output(0), op.Output(1), op.Output(2)
 }
+// Calculates the product with a tridiagonal matrix.
+//
+// Calculates the product of two matrices, where the left matrix is a tridiagonal matrix.
+//
+// Arguments:
+// superdiag: Tensor of shape `[..., 1, M]`, representing superdiagonals of
+// tri-diagonal matrices to the left of multiplication. Last element is ignored.
+// maindiag: Tensor of shape `[..., 1, M]`, representing main diagonals of tri-diagonal
+// matrices to the left of multiplication.
+// subdiag: Tensor of shape `[..., 1, M]`, representing subdiagonals of tri-diagonal
+// matrices to the left of multiplication. First element is ignored.
+// rhs: Tensor of shape `[..., M, N]`, representing MxN matrices to the right of
+// multiplication.
+//
+// Returns Tensor of shape `[..., M, N]` containing the product.
+func TridiagonalMatMul(scope *Scope, superdiag tf.Output, maindiag tf.Output, subdiag tf.Output, rhs tf.Output) (output tf.Output) {
+ if scope.Err() != nil {
+ return
+ }
+ opspec := tf.OpSpec{
+ Type: "TridiagonalMatMul",
+ Input: []tf.Input{
+ superdiag, maindiag, subdiag, rhs,
+ },
+ }
+ op := scope.AddOperation(opspec)
+ return op.Output(0)
+}
+
+// EuclideanNormAttr is an optional argument to EuclideanNorm.
+type EuclideanNormAttr func(optionalAttr)
+
+// EuclideanNormKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func EuclideanNormKeepDims(value bool) EuclideanNormAttr {
+ return func(m optionalAttr) {
+ m["keep_dims"] = value
+ }
+}
+
+// Computes the Euclidean norm of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+// input: The tensor to reduce.
+// axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
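// A minimal usage sketch of this wrapper from client code, assuming the usual
// Go binding import paths ("github.com/tensorflow/tensorflow/tensorflow/go" as
// tf and ".../tensorflow/go/op" as op); the values are illustrative only:
//
//	s := op.NewScope()
//	x := op.Const(s, [][]float32{{3, 4}, {6, 8}})
//	axis := op.Const(s, []int32{1})
//	norm := op.EuclideanNorm(s, x, axis, op.EuclideanNormKeepDims(true))
//	graph, err := s.Finalize() // running `norm` in a tf.Session yields [[5], [10]]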
+func EuclideanNorm(scope *Scope, input tf.Output, axis tf.Output, optional ...EuclideanNormAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "EuclideanNorm", + Input: []tf.Input{ + input, axis, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Returns the set of files matching one or more glob patterns. +// +// Note that this routine only supports wildcard characters in the +// basename portion of the pattern, not in the directory portion. +// Note also that the order of filenames returned can be non-deterministic. +// +// Arguments: +// pattern: Shell wildcard pattern(s). Scalar or vector of type string. +// +// Returns A vector of matching filenames. +func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "MatchingFiles", + Input: []tf.Input{ + pattern, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// A container for an iterator resource. +// +// Returns A handle to the iterator that can be passed to a "MakeIterator" +// or "IteratorGetNext" op. +func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "Iterator", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Counts the number of occurrences of each value in an integer array. +// +// Outputs a vector with length `size` and the same dtype as `weights`. If +// `weights` are empty, then index `i` stores the number of times the value `i` is +// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of +// the value in `weights` at each index where the corresponding value in `arr` is +// `i`. +// +// Values in `arr` outside of the range [0, size) are ignored. +// +// Arguments: +// arr: int32 `Tensor`. +// size: non-negative int32 scalar `Tensor`. +// weights: is an int32, int64, float32, or float64 `Tensor` with the same +// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights +// equal to 1. +// +// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for +// each value in the range [0, size). +func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Bincount", + Input: []tf.Input{ + arr, size, weights, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// JPEG encode input image with provided compression quality. +// +// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`. +// `quality` is an int32 jpeg compression quality value between 0 and 100. +// +// +// Arguments: +// images: Images to adjust. At least 3-D. +// quality: An int quality to encode to. +// +// Returns 0-D. JPEG-encoded image. 
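// A minimal usage sketch from client code; the op/tf import paths and the
// placeholder feed are assumptions for illustration only:
//
//	s := op.NewScope()
//	img := op.Placeholder(s, tf.Uint8) // fed later with a [height, width, channels] image
//	q := op.Const(s, int32(75))
//	jpeg := op.EncodeJpegVariableQuality(s, img, q)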
+func EncodeJpegVariableQuality(scope *Scope, images tf.Output, quality tf.Output) (contents tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "EncodeJpegVariableQuality", + Input: []tf.Input{ + images, quality, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RestoreSliceAttr is an optional argument to RestoreSlice. +type RestoreSliceAttr func(optionalAttr) + +// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value. +// +// value: Index of file to open first if multiple files match +// `file_pattern`. See the documentation for `Restore`. +// If not specified, defaults to -1 +func RestoreSlicePreferredShard(value int64) RestoreSliceAttr { + return func(m optionalAttr) { + m["preferred_shard"] = value + } +} + +// Restores a tensor from checkpoint files. +// +// This is like `Restore` except that restored tensor can be listed as filling +// only a slice of a larger tensor. `shape_and_slice` specifies the shape of the +// larger tensor and the slice that the restored tensor covers. +// +// The `shape_and_slice` input has the same format as the +// elements of the `shapes_and_slices` input of the `SaveSlices` op. +// +// Arguments: +// file_pattern: Must have a single element. The pattern of the files from +// which we read the tensor. +// tensor_name: Must have a single element. The name of the tensor to be +// restored. +// shape_and_slice: Scalar. The shapes and slice specifications to use when +// restoring a tensors. +// dt: The type of the tensor to be restored. +// +// Returns The restored tensor. +func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dt": dt} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "RestoreSlice", + Input: []tf.Input{ + file_pattern, tensor_name, shape_and_slice, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Generate a glob pattern matching all sharded file names. +func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ShardedFilespec", + Input: []tf.Input{ + basename, num_shards, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2. +type TensorArrayConcatV2Attr func(optionalAttr) + +// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value. +// If not specified, defaults to <unknown_rank:true > +func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr { + return func(m optionalAttr) { + m["element_shape_except0"] = value + } +} + +// Deprecated. Use TensorArrayConcatV3 +func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TensorArrayConcatV2", + Input: []tf.Input{ + handle, flow_in, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2. 
+type WholeFileReaderV2Attr func(optionalAttr) + +// WholeFileReaderV2Container sets the optional container attribute to value. +// +// value: If non-empty, this reader is placed in the given container. +// Otherwise, a default container is used. +// If not specified, defaults to "" +func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// WholeFileReaderV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this reader is named in the given bucket +// with this shared_name. Otherwise, the node name is used instead. +// If not specified, defaults to "" +func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// A Reader that outputs the entire contents of a file as a value. +// +// To use, enqueue filenames in a Queue. The output of ReaderRead will +// be a filename (key) and the contents of that file (value). +// +// Returns The handle to reference the Reader. +func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "WholeFileReaderV2", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox. type SampleDistortedBoundingBoxAttr func(optionalAttr) @@ -18514,78 +37514,51 @@ func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_box return op.Output(0), op.Output(1), op.Output(2) } -// ResourceApplyKerasMomentumAttr is an optional argument to ResourceApplyKerasMomentum. -type ResourceApplyKerasMomentumAttr func(optionalAttr) - -// ResourceApplyKerasMomentumUseLocking sets the optional use_locking attribute to value. +// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors. // -// value: If `True`, updating of the var and accum tensors will be protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceApplyKerasMomentumUseLocking(value bool) ResourceApplyKerasMomentumAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// ResourceApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value. +// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the +// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each +// input channel is processed independently of the others with its own structuring +// function. The `output` tensor has shape +// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output +// tensor depend on the `padding` algorithm. We currently only support the default +// "NHWC" `data_format`. // -// value: If `True`, the tensor passed to compute grad will be -// var + momentum * accum, so in the end, the var you get is actually -// var + momentum * accum. -// If not specified, defaults to false -func ResourceApplyKerasMomentumUseNesterov(value bool) ResourceApplyKerasMomentumAttr { - return func(m optionalAttr) { - m["use_nesterov"] = value - } -} - -// Update '*var' according to the momentum scheme. 
Set use_nesterov = True if you +// In detail, the grayscale morphological 2-D dilation is the max-sum correlation +// (for consistency with `conv2d`, we use unmirrored filters): // -// want to use Nesterov momentum. +// output[b, y, x, c] = +// max_{dy, dx} input[b, +// strides[1] * y + rates[1] * dy, +// strides[2] * x + rates[2] * dx, +// c] + +// filter[dy, dx, c] // -// accum = accum * momentum - lr * grad -// var += accum +// Max-pooling is a special case when the filter has size equal to the pooling +// kernel size and contains all zeros. +// +// Note on duality: The dilation of `input` by the `filter` is equal to the +// negation of the erosion of `-input` by the reflected `filter`. // // Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// lr: Scaling factor. Must be a scalar. -// grad: The gradient. -// momentum: Momentum. Must be a scalar. +// input: 4-D with shape `[batch, in_height, in_width, depth]`. +// filter: 3-D with shape `[filter_height, filter_width, depth]`. +// strides: The stride of the sliding window for each dimension of the input +// tensor. Must be: `[1, stride_height, stride_width, 1]`. +// rates: The input stride for atrous morphological dilation. Must be: +// `[1, rate_height, rate_width, 1]`. +// padding: The type of padding algorithm to use. // -// Returns the created operation. -func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyKerasMomentumAttr) (o *tf.Operation) { +// Returns 4-D with shape `[batch, out_height, out_width, depth]`. +func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } + attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding} opspec := tf.OpSpec{ - Type: "ResourceApplyKerasMomentum", + Type: "Dilation2D", Input: []tf.Input{ - var_, accum, lr, grad, momentum, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Deprecated. Use TensorArrayReadV3 -// -// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3 -func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - opspec := tf.OpSpec{ - Type: "TensorArrayReadV2", - Input: []tf.Input{ - handle, index, flow_in, + input, filter, }, Attrs: attrs, } @@ -18593,86 +37566,119 @@ func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in return op.Output(0) } -// Pop the element at the top of the stack. +// Concatenates tensors along one dimension. // // Arguments: -// handle: The handle to a stack. -// elem_type: The type of the elem that is popped. +// values: List of `N` Tensors to concatenate. Their ranks and types must match, +// and their sizes must match in all dimensions except `concat_dim`. +// axis: 0-D. The dimension along which to concatenate. Must be in the +// range [-rank(values), rank(values)). // -// Returns The tensor that is popped from the top of the stack. -func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) { +// Returns A `Tensor` with the concatenation of values stacked along the +// `concat_dim` dimension. 
This tensor's shape matches that of `values` except +// in `concat_dim` where it has the sum of the sizes. +func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"elem_type": elem_type} opspec := tf.OpSpec{ - Type: "StackPopV2", + Type: "ConcatV2", Input: []tf.Input{ - handle, + tf.OutputList(values), axis, }, - Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) } -// EnqueueTPUEmbeddingSparseBatchAttr is an optional argument to EnqueueTPUEmbeddingSparseBatch. -type EnqueueTPUEmbeddingSparseBatchAttr func(optionalAttr) +// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2. +type QueueDequeueManyV2Attr func(optionalAttr) -// EnqueueTPUEmbeddingSparseBatchDeviceOrdinal sets the optional device_ordinal attribute to value. +// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value. // -// value: The TPU device to use. Should be >= 0 and less than the number -// of TPU cores in the task on which the node is placed. +// value: If the queue has fewer than n elements, this operation +// will block for up to timeout_ms milliseconds. +// Note: This option is not supported yet. // If not specified, defaults to -1 -func EnqueueTPUEmbeddingSparseBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingSparseBatchAttr { +func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr { return func(m optionalAttr) { - m["device_ordinal"] = value + m["timeout_ms"] = value } } -// EnqueueTPUEmbeddingSparseBatchCombiners sets the optional combiners attribute to value. +// Dequeues `n` tuples of one or more tensors from the given queue. // -// value: A list of string scalars, one for each embedding table that specify -// how to normalize the embedding activations after weighted summation. -// Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have -// the sum of the weights be 0 for 'mean' or the sum of the squared weights be -// 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for -// all tables. -// If not specified, defaults to <> -func EnqueueTPUEmbeddingSparseBatchCombiners(value []string) EnqueueTPUEmbeddingSparseBatchAttr { - return func(m optionalAttr) { - m["combiners"] = value - } -} - -// An op that enqueues TPUEmbedding input indices from a SparseTensor. +// If the queue is closed and there are fewer than `n` elements, then an +// OutOfRange error is returned. // -// This Op eases the porting of code that uses embedding_lookup_sparse(), -// although some Python preprocessing of the SparseTensor arguments to -// embedding_lookup_sparse() is required to produce the arguments to this Op, -// since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training -// step. +// This operation concatenates queue-element component tensors along the +// 0th dimension to make a single component tensor. All of the components +// in the dequeued tuple will have size `n` in the 0th dimension. // -// The tensors at corresponding positions in the three input lists -// must have the same shape, i.e. rank 1 with dim_size() equal to the total -// number of lookups into the table described by the corresponding table_id. +// This operation has `k` outputs, where `k` is the number of components in +// the tuples stored in the given queue, and output `i` is the ith +// component of the dequeued tuple. +// +// N.B. 
If the queue is empty, this operation will block until `n` elements +// have been dequeued (or 'timeout_ms' elapses, if specified). // // Arguments: -// sample_indices: A list of rank 1 Tensors specifying the training example and -// feature to which the corresponding embedding_indices and aggregation_weights -// values belong. sample_indices[i] must equal b * nf + f, where nf is the -// number of features from the corresponding table, f is in [0, nf), and -// b is in [0, batch size). -// embedding_indices: A list of rank 1 Tensors, indices into the embedding tables. -// aggregation_weights: A list of rank 1 Tensors containing per sample -- i.e. per -// (training example, feature) -- aggregation weights. -// mode_override: A string input that overrides the mode specified in the -// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference', -// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set -// in TPUEmbeddingConfiguration is used, otherwise mode_override is used. +// handle: The handle to a queue. +// n: The number of tuples to dequeue. +// component_types: The type of each component in a tuple. // -// Returns the created operation. -func EnqueueTPUEmbeddingSparseBatch(scope *Scope, sample_indices []tf.Output, embedding_indices []tf.Output, aggregation_weights []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingSparseBatchAttr) (o *tf.Operation) { +// Returns One or more tensors that were dequeued as a tuple. +func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"component_types": component_types} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QueueDequeueManyV2", + Input: []tf.Input{ + handle, n, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if components, idx, err = makeOutputList(op, idx, "components"); err != nil { + scope.UpdateErr("QueueDequeueManyV2", err) + return + } + return components +} + +// SizeAttr is an optional argument to Size. +type SizeAttr func(optionalAttr) + +// SizeOutType sets the optional out_type attribute to value. +// If not specified, defaults to DT_INT32 +func SizeOutType(value tf.DataType) SizeAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// Returns the size of a tensor. +// +// This operation returns an integer representing the number of elements in +// `input`. +// +// For example: +// +// ``` +// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]] +// size(t) ==> 12 +// ``` +func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -18681,505 +37687,115 @@ func EnqueueTPUEmbeddingSparseBatch(scope *Scope, sample_indices []tf.Output, em a(attrs) } opspec := tf.OpSpec{ - Type: "EnqueueTPUEmbeddingSparseBatch", - Input: []tf.Input{ - tf.OutputList(sample_indices), tf.OutputList(embedding_indices), tf.OutputList(aggregation_weights), mode_override, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Creates a dataset that emits the outputs of `input_dataset` `count` times. -// -// Arguments: -// -// count: A scalar representing the number of times that `input_dataset` should -// be repeated. A value of `-1` indicates that it should be repeated infinitely. 
-// -// -func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "RepeatDataset", - Input: []tf.Input{ - input_dataset, count, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp. -type ResourceApplyCenteredRMSPropAttr func(optionalAttr) - -// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var, mg, ms, and mom tensors is -// protected by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update '*var' according to the centered RMSProp algorithm. -// -// The centered RMSProp algorithm uses an estimate of the centered second moment -// (i.e., the variance) for normalization, as opposed to regular RMSProp, which -// uses the (uncentered) second moment. This often helps with training, but is -// slightly more expensive in terms of computation and memory. -// -// Note that in dense implementation of this algorithm, mg, ms, and mom will -// update even if the grad is zero, but in this sparse implementation, mg, ms, -// and mom will not update in iterations during which the grad is zero. -// -// mean_square = decay * mean_square + (1-decay) * gradient ** 2 -// mean_grad = decay * mean_grad + (1-decay) * gradient -// -// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) -// -// mg <- rho * mg_{t-1} + (1-rho) * grad -// ms <- rho * ms_{t-1} + (1-rho) * grad * grad -// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) -// var <- var - mom -// -// Arguments: -// var_: Should be from a Variable(). -// mg: Should be from a Variable(). -// ms: Should be from a Variable(). -// mom: Should be from a Variable(). -// lr: Scaling factor. Must be a scalar. -// rho: Decay rate. Must be a scalar. -// -// epsilon: Ridge term. Must be a scalar. -// grad: The gradient. -// -// Returns the created operation. -func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceApplyCenteredRMSProp", - Input: []tf.Input{ - var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// The gradient operator for the SparseSlice op. -// -// This op takes in the upstream gradient w.r.t. non-empty values of -// the sliced `SparseTensor`, and outputs the gradients w.r.t. -// the non-empty values of input `SparseTensor`. -// -// Arguments: -// backprop_val_grad: 1-D. The gradient with respect to -// the non-empty values of the sliced `SparseTensor`. -// input_indices: 2-D. The `indices` of the input `SparseTensor`. -// input_start: 1-D. tensor represents the start of the slice. 
-// output_indices: 2-D. The `indices` of the sliced `SparseTensor`. -// -// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`. -func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseSliceGrad", - Input: []tf.Input{ - backprop_val_grad, input_indices, input_start, output_indices, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Provides the time since epoch in seconds. -// -// Returns the timestamp as a `float64` for seconds since the Unix epoch. -// -// Note: the timestamp is computed when the op is executed, not when it is added -// to the graph. -func Timestamp(scope *Scope) (ts tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Timestamp", - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad. -type ResourceSparseApplyProximalAdagradAttr func(optionalAttr) - -// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value. -// -// value: If True, updating of the var and accum tensors will be protected by -// a lock; otherwise the behavior is undefined, but may exhibit less contention. -// If not specified, defaults to false -func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm. -// -// That is for rows we have grad for, we update var and accum as follows: -// accum += grad * grad -// prox_v = var -// prox_v -= lr * grad * (1 / sqrt(accum)) -// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} -// -// Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// lr: Learning rate. Must be a scalar. -// l1: L1 regularization. Must be a scalar. -// l2: L2 regularization. Must be a scalar. -// grad: The gradient. -// indices: A vector of indices into the first dimension of var and accum. -// -// Returns the created operation. -func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceSparseApplyProximalAdagrad", - Input: []tf.Input{ - var_, accum, lr, l1, l2, grad, indices, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// LoadTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingCenteredRMSPropParameters. -type LoadTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr) - -// LoadTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func LoadTPUEmbeddingCenteredRMSPropParametersTableId(value int64) LoadTPUEmbeddingCenteredRMSPropParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// LoadTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value. 
-// If not specified, defaults to "" -func LoadTPUEmbeddingCenteredRMSPropParametersTableName(value string) LoadTPUEmbeddingCenteredRMSPropParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Load centered RMSProp embedding parameters. -// -// An op that loads optimization parameters into HBM for embedding. Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the centered RMSProp optimization algorithm. -// ms: Value of ms used in the centered RMSProp optimization algorithm. -// mom: Value of mom used in the centered RMSProp optimization algorithm. -// mg: Value of mg used in the centered RMSProp optimization algorithm. -// -// -// -// Returns the created operation. -func LoadTPUEmbeddingCenteredRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingCenteredRMSPropParametersAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingCenteredRMSPropParameters", - Input: []tf.Input{ - parameters, ms, mom, mg, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// ResourceSparseApplyKerasMomentumAttr is an optional argument to ResourceSparseApplyKerasMomentum. -type ResourceSparseApplyKerasMomentumAttr func(optionalAttr) - -// ResourceSparseApplyKerasMomentumUseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var and accum tensors will be protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceSparseApplyKerasMomentumUseLocking(value bool) ResourceSparseApplyKerasMomentumAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// ResourceSparseApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value. -// -// value: If `True`, the tensor passed to compute grad will be -// var + momentum * accum, so in the end, the var you get is actually -// var + momentum * accum. -// If not specified, defaults to false -func ResourceSparseApplyKerasMomentumUseNesterov(value bool) ResourceSparseApplyKerasMomentumAttr { - return func(m optionalAttr) { - m["use_nesterov"] = value - } -} - -// Update relevant entries in '*var' and '*accum' according to the momentum scheme. -// -// Set use_nesterov = True if you want to use Nesterov momentum. -// -// That is for rows we have grad for, we update var and accum as follows: -// -// accum = accum * momentum - lr * grad -// var += accum -// -// Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// lr: Learning rate. Must be a scalar. -// grad: The gradient. -// indices: A vector of indices into the first dimension of var and accum. -// momentum: Momentum. Must be a scalar. -// -// Returns the created operation. 
-func ResourceSparseApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyKerasMomentumAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceSparseApplyKerasMomentum", - Input: []tf.Input{ - var_, accum, lr, grad, indices, momentum, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// SparseReduceMaxAttr is an optional argument to SparseReduceMax. -type SparseReduceMaxAttr func(optionalAttr) - -// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value. -// -// value: If true, retain reduced dimensions with length 1. -// If not specified, defaults to false -func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr { - return func(m optionalAttr) { - m["keep_dims"] = value - } -} - -// Computes the max of elements across dimensions of a SparseTensor. -// -// This Op takes a SparseTensor and is the sparse counterpart to -// `tf.reduce_max()`. In particular, this Op also returns a dense `Tensor` -// instead of a sparse one. -// -// Reduces `sp_input` along the dimensions given in `reduction_axes`. Unless -// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained -// with length 1. -// -// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor -// with a single element is returned. Additionally, the axes can be negative, -// which are interpreted according to the indexing rules in Python. -// -// Arguments: -// input_indices: 2-D. `N x R` matrix with the indices of non-empty values in a -// SparseTensor, possibly not in canonical ordering. -// input_values: 1-D. `N` non-empty values corresponding to `input_indices`. -// input_shape: 1-D. Shape of the input SparseTensor. -// reduction_axes: 1-D. Length-`K` vector containing the reduction axes. -// -// Returns `R-K`-D. The reduced Tensor. -func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SparseReduceMax", - Input: []tf.Input{ - input_indices, input_values, input_shape, reduction_axes, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Assigns a new value to a variable. -// -// Any ReadVariableOp with a control dependency on this op is guaranteed to return -// this value or a subsequent newer value of the variable. -// -// Arguments: -// resource: handle to the resource in which to store the variable. -// value: the value to set the new tensor to use. -// -// Returns the created operation. -func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "AssignVariableOp", - Input: []tf.Input{ - resource, value, - }, - } - return scope.AddOperation(opspec) -} - -// Computes softmax activations. -// -// For each batch `i` and class `j` we have -// -// $$softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))$$ -// -// Arguments: -// logits: 2-D with shape `[batch_size, num_classes]`. 
-// -// Returns Same shape as `logits`. -func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Softmax", - Input: []tf.Input{ - logits, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug. -type RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr) - -// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve proximal Adagrad embedding parameters with debug support. -// -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. -// -// Returns Parameter parameters updated by the proximal Adagrad optimization algorithm.Parameter accumulators updated by the proximal Adagrad optimization algorithm.Parameter gradient_accumulators updated by the proximal Adagrad optimization algorithm. -func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// 3D fast Fourier transform. -// -// Computes the 3-dimensional discrete Fourier transform over the inner-most 3 -// dimensions of `input`. -// -// Arguments: -// input: A complex64 tensor. -// -// Returns A complex64 tensor of the same shape as `input`. The inner-most 3 -// dimensions of `input` are replaced with their 3D Fourier transform. -// -// @compatibility(numpy) -// Equivalent to np.fft.fftn with 3 dimensions. -// @end_compatibility -func FFT3D(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "FFT3D", + Type: "Size", Input: []tf.Input{ input, }, + Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) } -// Returns the truth value of (x <= y) element-wise. +// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2. +type TFRecordReaderV2Attr func(optionalAttr) + +// TFRecordReaderV2Container sets the optional container attribute to value. 
// -// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { +// value: If non-empty, this reader is placed in the given container. +// Otherwise, a default container is used. +// If not specified, defaults to "" +func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// TFRecordReaderV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this reader is named in the given bucket +// with this shared_name. Otherwise, the node name is used instead. +// If not specified, defaults to "" +func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value. +// If not specified, defaults to "" +func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr { + return func(m optionalAttr) { + m["compression_type"] = value + } +} + +// A Reader that outputs the records from a TensorFlow Records file. +// +// Returns The handle to reference the Reader. +func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TFRecordReaderV2", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// An op that receives embedding activations on the TPU. +// +// The TPU system performs the embedding lookups and aggregations specified by +// the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The +// results of these aggregations are visible to the Tensorflow Graph as the +// outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing +// one Tensor of activations per table specified in the model. There can be at +// most one RecvTPUEmbeddingActivations op in the TPU graph. +// +// Arguments: +// num_outputs: The number of output activation tensors, equal to the number of +// embedding tables in the model. +// config: Serialized TPUEmbeddingConfiguration proto. +// +// Returns A TensorList of embedding activations containing one Tensor per +// embedding table in the model. +func RecvTPUEmbeddingActivations(scope *Scope, num_outputs int64, config string) (outputs []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_outputs": num_outputs, "config": config} + opspec := tf.OpSpec{ + Type: "RecvTPUEmbeddingActivations", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil { + scope.UpdateErr("RecvTPUEmbeddingActivations", err) + return + } + return outputs +} + +// Returns 0 if x == 0, and x * log(y) otherwise, elementwise. 
+func Xlogy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "LessEqual", + Type: "Xlogy", Input: []tf.Input{ x, y, }, @@ -19188,182 +37804,109 @@ func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { return op.Output(0) } -// Computes the matrix logarithm of one or more square matrices: -// -// -// \\(log(exp(A)) = A\\) -// -// This op is only defined for complex matrices. If A is positive-definite and -// real, then casting to a complex matrix, taking the logarithm and casting back -// to a real matrix will give the correct result. -// -// This function computes the matrix logarithm using the Schur-Parlett algorithm. -// Details of the algorithm can be found in Section 11.6.2 of: -// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008. -// ISBN 978-0-898716-46-7. -// -// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions -// form square matrices. The output is a tensor of the same shape as the input -// containing the exponential for all input submatrices `[..., :, :]`. +// Gets next element for the provided shard number. // // Arguments: -// input: Shape is `[..., M, M]`. +// multi_device_iterator: A MultiDeviceIterator resource. +// shard_num: Integer representing which shard to fetch data for. +// incarnation_id: Which incarnation of the MultiDeviceIterator is running. +// output_types: The type list for the return values. +// output_shapes: The list of shapes being produced. // -// Returns Shape is `[..., M, M]`. +// Returns Result of the get_next on the dataset. +func MultiDeviceIteratorGetNextFromShard(scope *Scope, multi_device_iterator tf.Output, shard_num tf.Output, incarnation_id tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "MultiDeviceIteratorGetNextFromShard", + Input: []tf.Input{ + multi_device_iterator, shard_num, incarnation_id, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if components, idx, err = makeOutputList(op, idx, "components"); err != nil { + scope.UpdateErr("MultiDeviceIteratorGetNextFromShard", err) + return + } + return components +} + +// Returns x * y element-wise. // -// @compatibility(scipy) -// Equivalent to scipy.linalg.logm -// @end_compatibility -func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) { +// *NOTE*: `Multiply` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "MatrixLogarithm", + Type: "Mul", Input: []tf.Input{ - input, + x, y, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// PriorityQueueV2Attr is an optional argument to PriorityQueueV2. -type PriorityQueueV2Attr func(optionalAttr) +// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute. +type DataFormatVecPermuteAttr func(optionalAttr) -// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value. +// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value. // -// value: The type of each component in a value. 
-// If not specified, defaults to <> -// -// REQUIRES: len(value) >= 0 -func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr { +// value: source data format. +// If not specified, defaults to "NHWC" +func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr { return func(m optionalAttr) { - m["component_types"] = value + m["src_format"] = value } } -// PriorityQueueV2Capacity sets the optional capacity attribute to value. +// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value. // -// value: The upper bound on the number of elements in this queue. -// Negative numbers mean no limit. -// If not specified, defaults to -1 -func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr { +// value: destination data format. +// If not specified, defaults to "NCHW" +func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr { return func(m optionalAttr) { - m["capacity"] = value + m["dst_format"] = value } } -// PriorityQueueV2Container sets the optional container attribute to value. +// Returns the permuted vector/tensor in the destination data format given the // -// value: If non-empty, this queue is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func PriorityQueueV2Container(value string) PriorityQueueV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// PriorityQueueV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this queue will be shared under the given name -// across multiple sessions. -// If not specified, defaults to "" -func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// A queue that produces elements sorted by the first component value. -// -// Note that the PriorityQueue requires the first component of any element -// to be a scalar int64, in addition to the other elements declared by -// component_types. Therefore calls to Enqueue and EnqueueMany (resp. Dequeue -// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra -// entry in their input (resp. output) lists. +// one in the source data format. // // Arguments: -// shapes: The shape of each component in a value. The length of this attr must -// be either 0 or the same as the length of component_types. If the length of -// this attr is 0, the shapes of queue elements are not constrained, and -// only one element may be dequeued at a time. +// x: Vector of size 4 or Tensor of shape (4, 2) in source data format. // -// Returns The handle to the queue. -func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) { +// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format. +func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"shapes": shapes} + attrs := map[string]interface{}{} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "PriorityQueueV2", - + Type: "DataFormatVecPermute", + Input: []tf.Input{ + x, + }, Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) } -// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingAdadeltaParametersGradAccumDebug. 
-type LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr func(optionalAttr) - -// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Load Adadelta parameters with debug support. -// -// An op that loads optimization parameters into HBM for embedding. Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the Adadelta optimization algorithm. -// accumulators: Value of accumulators used in the Adadelta optimization algorithm. -// updates: Value of updates used in the Adadelta optimization algorithm. -// gradient_accumulators: Value of gradient_accumulators used in the Adadelta optimization algorithm. -// -// -// -// Returns the created operation. -func LoadTPUEmbeddingAdadeltaParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug", - Input: []tf.Input{ - parameters, accumulators, updates, gradient_accumulators, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - // ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad. type ResourceApplyAdagradAttr func(optionalAttr) @@ -19417,1916 +37960,122 @@ func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.O return scope.AddOperation(opspec) } -// Creates a dataset that splits a SparseTensor into elements row-wise. -func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) { +// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount). +// +// For each entry in `x`, calculates the number of `1` (on) bits in the binary +// representation of that entry. +// +// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into +// `int32` or `int64` and perform the bitcount on the result, than to feed in +// 8- or 16-bit inputs and then aggregate the resulting counts. +func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "SparseTensorSliceDataset", + Type: "PopulationCount", Input: []tf.Input{ - indices, values, dense_shape, + x, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool. 
-type FractionalAvgPoolAttr func(optionalAttr) - -// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value. -// -// value: When set to True, generates the pooling sequence in a -// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin -// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for -// difference between pseudorandom and random. -// If not specified, defaults to false -func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr { - return func(m optionalAttr) { - m["pseudo_random"] = value - } -} - -// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value. -// -// value: When set to True, it means when pooling, the values at the boundary -// of adjacent pooling cells are used by both cells. For example: -// -// `index 0 1 2 3 4` -// -// `value 20 5 16 3 7` -// -// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice. -// The result would be [41/3, 26/3] for fractional avg pooling. -// If not specified, defaults to false -func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr { - return func(m optionalAttr) { - m["overlapping"] = value - } -} - -// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value. -// -// value: When set to True, a fixed pooling region will be used when -// iterating over a FractionalAvgPool node in the computation graph. Mainly used -// in unit test to make FractionalAvgPool deterministic. -// If not specified, defaults to false -func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr { - return func(m optionalAttr) { - m["deterministic"] = value - } -} - -// FractionalAvgPoolSeed sets the optional seed attribute to value. -// -// value: If either seed or seed2 are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value. -// -// value: An second seed to avoid seed collision. -// If not specified, defaults to 0 -func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Performs fractional average pooling on the input. -// -// Fractional average pooling is similar to Fractional max pooling in the pooling -// region generation step. The only difference is that after pooling regions are -// generated, a mean operation is performed instead of a max operation in each -// pooling region. +// Delete the tensor specified by its handle in the session. // // Arguments: -// value: 4-D with shape `[batch, height, width, channels]`. -// pooling_ratio: Pooling ratio for each dimension of `value`, currently only -// supports row and col dimension and should be >= 1.0. For example, a valid -// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements -// must be 1.0 because we don't allow pooling on batch and channels -// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions -// respectively. -// -// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient. 
-func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"pooling_ratio": pooling_ratio} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "FractionalAvgPool", - Input: []tf.Input{ - value, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor. -// -// This operation folds the padded areas of `input` by `MirrorPad` according to the -// `paddings` you specify. `paddings` must be the same as `paddings` argument -// given to the corresponding `MirrorPad` op. -// -// The folded size of each dimension D of the output is: -// -// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)` -// -// For example: -// -// ``` -// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]]. -// # 'paddings' is [[0, 1]], [0, 1]]. -// # 'mode' is SYMMETRIC. -// # rank of 't' is 2. -// pad(t, paddings) ==> [[ 1, 5] -// [11, 28]] -// ``` -// -// Arguments: -// input: The input tensor to be folded. -// paddings: A two-column matrix specifying the padding sizes. The number of -// rows must be the same as the rank of `input`. -// mode: The mode used in the `MirrorPad` op. -// -// Returns The folded tensor. -func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"mode": mode} - opspec := tf.OpSpec{ - Type: "MirrorPadGrad", - Input: []tf.Input{ - input, paddings, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace. -type StaticRegexReplaceAttr func(optionalAttr) - -// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value. -// -// value: If True, the replacement is global, otherwise the replacement -// is done only on the first match. -// If not specified, defaults to true -func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr { - return func(m optionalAttr) { - m["replace_global"] = value - } -} - -// Replaces the match of pattern in input with rewrite. -// -// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax) -// -// Arguments: -// input: The text to be processed. -// pattern: The regular expression to match the input. -// rewrite: The rewrite to be applied to the matched expression. -// -// Returns The text after applying pattern and rewrite. -func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StaticRegexReplace", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Generates sparse cross from a list of sparse and dense tensors. -// -// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each -// representing features of one feature column. It outputs a 2D `SparseTensor` with -// the batchwise crosses of these features. 
-// -// For example, if the inputs are -// -// inputs[0]: SparseTensor with shape = [2, 2] -// [0, 0]: "a" -// [1, 0]: "b" -// [1, 1]: "c" -// -// inputs[1]: SparseTensor with shape = [2, 1] -// [0, 0]: "d" -// [1, 0]: "e" -// -// inputs[2]: Tensor [["f"], ["g"]] -// -// then the output will be -// -// shape = [2, 2] -// [0, 0]: "a_X_d_X_f" -// [1, 0]: "b_X_e_X_g" -// [1, 1]: "c_X_e_X_g" -// -// if hashed_output=true then the output will be -// -// shape = [2, 2] -// [0, 0]: FingerprintCat64( -// Fingerprint64("f"), FingerprintCat64( -// Fingerprint64("d"), Fingerprint64("a"))) -// [1, 0]: FingerprintCat64( -// Fingerprint64("g"), FingerprintCat64( -// Fingerprint64("e"), Fingerprint64("b"))) -// [1, 1]: FingerprintCat64( -// Fingerprint64("g"), FingerprintCat64( -// Fingerprint64("e"), Fingerprint64("c"))) -// -// Arguments: -// indices: 2-D. Indices of each input `SparseTensor`. -// values: 1-D. values of each `SparseTensor`. -// shapes: 1-D. Shapes of each `SparseTensor`. -// dense_inputs: 2-D. Columns represented by dense `Tensor`. -// hashed_output: If true, returns the hash of the cross instead of the string. -// This will allow us avoiding string manipulations. -// num_buckets: It is used if hashed_output is true. -// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value. -// hash_key: Specify the hash_key that will be used by the `FingerprintCat64` -// function to combine the crosses fingerprints. -// -// -// -// Returns 2-D. Indices of the concatenated `SparseTensor`.1-D. Non-empty values of the concatenated or hashed -// `SparseTensor`.1-D. Shape of the concatenated `SparseTensor`. -func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type} - opspec := tf.OpSpec{ - Type: "SparseCross", - Input: []tf.Input{ - tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// RandomUniformAttr is an optional argument to RandomUniform. -type RandomUniformAttr func(optionalAttr) - -// RandomUniformSeed sets the optional seed attribute to value. -// -// value: If either `seed` or `seed2` are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func RandomUniformSeed(value int64) RandomUniformAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// RandomUniformSeed2 sets the optional seed2 attribute to value. -// -// value: A second seed to avoid seed collision. -// If not specified, defaults to 0 -func RandomUniformSeed2(value int64) RandomUniformAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Outputs random values from a uniform distribution. -// -// The generated values follow a uniform distribution in the range `[0, 1)`. The -// lower bound 0 is included in the range, while the upper bound 1 is excluded. -// -// Arguments: -// shape: The shape of the output tensor. -// dtype: The type of the output. 
-// -// Returns A tensor of the specified shape filled with uniform random values. -func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RandomUniform", - Input: []tf.Input{ - shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a dataset that concatenates `input_dataset` with `another_dataset`. -func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "ConcatenateDataset", - Input: []tf.Input{ - input_dataset, another_dataset, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Reduces sparse updates into the variable referenced by `resource` using the `max` operation. -// -// This operation computes -// -// # Scalar indices -// ref[indices, ...] = max(ref[indices, ...], updates[...]) -// -// # Vector indices (for each i) -// ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...]) -// -// # High rank indices (for each i, ..., j) -// ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...]) -// -// Duplicate entries are handled correctly: if multiple `indices` reference -// the same location, their contributions are combined. -// -// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt> -// </div> -// -// Arguments: -// resource: Should be from a `Variable` node. -// indices: A tensor of indices into the first dimension of `ref`. -// updates: A tensor of updated values to add to `ref`. +// handle: The handle for a tensor stored in the session state. // // Returns the created operation. -func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) { +func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "ResourceScatterMax", + Type: "DeleteSessionTensor", Input: []tf.Input{ - resource, indices, updates, + handle, }, } return scope.AddOperation(opspec) } -// Compute the upper regularized incomplete Gamma function `Q(a, x)`. -// -// The upper regularized incomplete Gamma function is defined as: -// -// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\) -// -// where -// -// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\) -// -// is the upper incomplete Gama function. -// -// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete -// Gamma function. -func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Igammac", - Input: []tf.Input{ - a, x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter. 
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr) - -// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value. -// -// value: Specify the data format of the input and output data. With the -// default format "NHWC", the data is stored in the order of: -// [batch, height, width, channels]. -// Alternatively, the format could be "NCHW", the data storage order of: -// [batch, channels, height, width]. -// If not specified, defaults to "NHWC" -func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value. -// -// value: 1-D tensor of length 4. The dilation factor for each dimension of -// `input`. If set to k > 1, there will be k-1 skipped cells between each filter -// element on that dimension. The dimension order is determined by the value of -// `data_format`, see above for details. Dilations in the batch and depth -// dimensions must be 1. -// If not specified, defaults to <i:1 i:1 i:1 i:1 > -func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes the gradients of depthwise convolution with respect to the filter. -// -// Arguments: -// input: 4-D with shape based on `data_format`. For example, if -// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height, -// in_width, in_channels]` tensor. -// filter_sizes: An integer vector representing the tensor shape of `filter`, -// where `filter` is a 4-D -// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor. -// out_backprop: 4-D with shape based on `data_format`. -// For example, if `data_format` is 'NHWC' then -// out_backprop shape is `[batch, out_height, out_width, out_channels]`. -// Gradients w.r.t. the output of the convolution. -// strides: The stride of the sliding window for each dimension of the input -// of the convolution. -// padding: The type of padding algorithm to use. -// -// Returns 4-D with shape -// `[filter_height, filter_width, in_channels, out_channels]`. Gradient w.r.t. -// the `filter` input of the convolution. -func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "DepthwiseConv2dNativeBackpropFilter", - Input: []tf.Input{ - input, filter_sizes, out_backprop, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Split a `SparseTensor` into `num_split` tensors along one dimension. -// -// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices -// `[0 : shape[split_dim] % num_split]` gets one extra dimension. -// For example, if `split_dim = 1` and `num_split = 2` and the input is -// -// input_tensor = shape = [2, 7] -// [ a d e ] -// [b c ] -// -// Graphically the output tensors are: -// -// output_tensor[0] = shape = [2, 4] -// [ a ] -// [b c ] -// -// output_tensor[1] = shape = [2, 3] -// [ d e ] -// [ ] -// -// Arguments: -// split_dim: 0-D. The dimension along which to split. 
Must be in the range -// `[0, rank(shape))`. -// indices: 2-D tensor represents the indices of the sparse tensor. -// values: 1-D tensor represents the values of the sparse tensor. -// shape: 1-D. tensor represents the shape of the sparse tensor. -// output indices: A list of 1-D tensors represents the indices of the output -// sparse tensors. -// num_split: The number of ways to split. -// -// Returns A list of 1-D tensors represents the values of the output sparse -// tensors.A list of 1-D tensors represents the shape of the output sparse -// tensors. -func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_split": num_split} - opspec := tf.OpSpec{ - Type: "SparseSplit", - Input: []tf.Input{ - split_dim, indices, values, shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil { - scope.UpdateErr("SparseSplit", err) - return - } - if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil { - scope.UpdateErr("SparseSplit", err) - return - } - if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil { - scope.UpdateErr("SparseSplit", err) - return - } - return output_indices, output_values, output_shape -} - -// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes. -type CropAndResizeGradBoxesAttr func(optionalAttr) - -// CropAndResizeGradBoxesMethod sets the optional method attribute to value. -// -// value: A string specifying the interpolation method. Only 'bilinear' is -// supported for now. -// If not specified, defaults to "bilinear" -func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr { - return func(m optionalAttr) { - m["method"] = value - } -} - -// Computes the gradient of the crop_and_resize op wrt the input boxes tensor. -// -// Arguments: -// grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`. -// image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`. -// Both `image_height` and `image_width` need to be positive. -// boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor -// specifies the coordinates of a box in the `box_ind[i]` image and is specified -// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of -// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the -// `[0, 1]` interval of normalized image height is mapped to -// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in -// which case the sampled crop is an up-down flipped version of the original -// image. The width dimension is treated similarly. Normalized coordinates -// outside the `[0, 1]` range are allowed, in which case we use -// `extrapolation_value` to extrapolate the input image values. -// box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`. -// The value of `box_ind[i]` specifies the image that the `i`-th box refers to. -// -// Returns A 2-D tensor of shape `[num_boxes, 4]`. 
-func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "CropAndResizeGradBoxes", - Input: []tf.Input{ - grads, image, boxes, box_ind, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Transforms a vector of brain.Example protos (as strings) into typed tensors. -// -// Arguments: -// serialized: A vector containing a batch of binary serialized Example protos. -// names: A vector containing the names of the serialized protos. -// May contain, for example, table key (descriptive) names for the -// corresponding serialized protos. These are purely useful for debugging -// purposes, and the presence of values here has no effect on the output. -// May also be an empty vector if no names are available. -// If non-empty, this vector must be the same length as "serialized". -// sparse_keys: A list of Nsparse string Tensors (scalars). -// The keys expected in the Examples' features associated with sparse values. -// dense_keys: A list of Ndense string Tensors (scalars). -// The keys expected in the Examples' features associated with dense values. -// dense_defaults: A list of Ndense Tensors (some may be empty). -// dense_defaults[j] provides default values -// when the example's feature_map lacks dense_key[j]. If an empty Tensor is -// provided for dense_defaults[j], then the Feature dense_keys[j] is required. -// The input type is inferred from dense_defaults[j], even when it's empty. -// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined, -// then the shape of dense_defaults[j] must match that of dense_shapes[j]. -// If dense_shapes[j] has an undefined major dimension (variable strides dense -// feature), dense_defaults[j] must contain a single element: -// the padding element. -// sparse_types: A list of Nsparse types; the data types of data in each Feature -// given in sparse_keys. -// Currently the ParseExample supports DT_FLOAT (FloatList), -// DT_INT64 (Int64List), and DT_STRING (BytesList). -// dense_shapes: A list of Ndense shapes; the shapes of data in each Feature -// given in dense_keys. -// The number of elements in the Feature corresponding to dense_key[j] -// must always equal dense_shapes[j].NumEntries(). -// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output -// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN): -// The dense outputs are just the inputs row-stacked by batch. -// This works for dense_shapes[j] = (-1, D1, ..., DN). In this case -// the shape of the output Tensor dense_values[j] will be -// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks -// of elements of length D1 * .... * DN, across all minibatch entries -// in the input. Any minibatch entry with less than M blocks of elements of -// length D1 * ... * DN will be padded with the corresponding default_value -// scalar element along the second dimension. 
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes} - opspec := tf.OpSpec{ - Type: "ParseExample", - Input: []tf.Input{ - serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil { - scope.UpdateErr("ParseExample", err) - return - } - if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil { - scope.UpdateErr("ParseExample", err) - return - } - if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil { - scope.UpdateErr("ParseExample", err) - return - } - if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil { - scope.UpdateErr("ParseExample", err) - return - } - return sparse_indices, sparse_values, sparse_shapes, dense_values -} - -// UnicodeDecodeAttr is an optional argument to UnicodeDecode. -type UnicodeDecodeAttr func(optionalAttr) - -// UnicodeDecodeErrors sets the optional errors attribute to value. -// -// value: Error handling policy when there is invalid formatting found in the input. -// The value of 'strict' will cause the operation to produce a InvalidArgument -// error on any invalid input formatting. A value of 'replace' (the default) will -// cause the operation to replace any invalid formatting in the input with the -// `replacement_char` codepoint. A value of 'ignore' will cause the operation to -// skip any invalid formatting in the input and produce no corresponding output -// character. -// If not specified, defaults to "replace" -func UnicodeDecodeErrors(value string) UnicodeDecodeAttr { - return func(m optionalAttr) { - m["errors"] = value - } -} - -// UnicodeDecodeReplacementChar sets the optional replacement_char attribute to value. -// -// value: The replacement character codepoint to be used in place of any invalid -// formatting in the input when `errors='replace'`. Any valid unicode codepoint may -// be used. The default value is the default unicode replacement character is -// 0xFFFD or U+65533.) -// If not specified, defaults to 65533 -func UnicodeDecodeReplacementChar(value int64) UnicodeDecodeAttr { - return func(m optionalAttr) { - m["replacement_char"] = value - } -} - -// UnicodeDecodeReplaceControlCharacters sets the optional replace_control_characters attribute to value. -// -// value: Whether to replace the C0 control characters (00-1F) with the -// `replacement_char`. Default is false. -// If not specified, defaults to false -func UnicodeDecodeReplaceControlCharacters(value bool) UnicodeDecodeAttr { - return func(m optionalAttr) { - m["replace_control_characters"] = value - } -} - -// UnicodeDecodeTsplits sets the optional Tsplits attribute to value. -// If not specified, defaults to DT_INT64 -func UnicodeDecodeTsplits(value tf.DataType) UnicodeDecodeAttr { - return func(m optionalAttr) { - m["Tsplits"] = value - } -} - -// Decodes each string in `input` into a sequence of Unicode code points. 
-// -// The character codepoints for all strings are returned using a single vector -// `char_values`, with strings expanded to characters in row-major order. -// -// The `row_splits` tensor indicates where the codepoints for -// each input string begin and end within the `char_values` tensor. -// In particular, the values for the `i`th -// string (in row-major order) are stored in the slice -// `[row_splits[i]:row_splits[i+1]]`. Thus: -// -// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th -// character in the `i`th string (in row-major order). -// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th -// string (in row-major order). -// -// Arguments: -// input: The text to be decoded. Can have any shape. Note that the output is flattened -// to a vector of char values. -// input_encoding: Text encoding of the input strings. This is any of the encodings supported -// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`. -// -// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints. -func UnicodeDecode(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeAttr) (row_splits tf.Output, char_values tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"input_encoding": input_encoding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "UnicodeDecode", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput. -type Conv2DBackpropInputAttr func(optionalAttr) - -// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value. -// If not specified, defaults to true -func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr { - return func(m optionalAttr) { - m["use_cudnn_on_gpu"] = value - } -} - -// Conv2DBackpropInputExplicitPaddings sets the optional explicit_paddings attribute to value. -// -// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith -// dimension, the amount of padding inserted before and after the dimension is -// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If -// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty. -// If not specified, defaults to <> -func Conv2DBackpropInputExplicitPaddings(value []int64) Conv2DBackpropInputAttr { - return func(m optionalAttr) { - m["explicit_paddings"] = value - } -} - -// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value. -// -// value: Specify the data format of the input and output data. With the -// default format "NHWC", the data is stored in the order of: -// [batch, in_height, in_width, in_channels]. -// Alternatively, the format could be "NCHW", the data storage order of: -// [batch, in_channels, in_height, in_width]. -// If not specified, defaults to "NHWC" -func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Conv2DBackpropInputDilations sets the optional dilations attribute to value. -// -// value: 1-D tensor of length 4. The dilation factor for each dimension of -// `input`. If set to k > 1, there will be k-1 skipped cells between each filter -// element on that dimension. 
The dimension order is determined by the value of -// `data_format`, see above for details. Dilations in the batch and depth -// dimensions must be 1. -// If not specified, defaults to <i:1 i:1 i:1 i:1 > -func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes the gradients of convolution with respect to the input. -// -// Arguments: -// input_sizes: An integer vector representing the shape of `input`, -// where `input` is a 4-D `[batch, height, width, channels]` tensor. -// filter: 4-D with shape -// `[filter_height, filter_width, in_channels, out_channels]`. -// out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`. -// Gradients w.r.t. the output of the convolution. -// strides: The stride of the sliding window for each dimension of the input -// of the convolution. Must be in the same order as the dimension specified with -// format. -// padding: The type of padding algorithm to use. -// -// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`. Gradient -// w.r.t. the input of the convolution. -func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Conv2DBackpropInput", - Input: []tf.Input{ - input_sizes, filter, out_backprop, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes softmax cross entropy cost and gradients to backpropagate. -// -// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept -// a matrix of label probabilities, but rather a single label per row -// of features. This label is considered to have probability 1.0 for the -// given row. -// -// Inputs are the logits, not probabilities. -// -// Arguments: -// features: batch_size x num_classes matrix -// labels: batch_size vector with values in [0, num_classes). -// This is the label for the given minibatch entry. -// -// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix). -func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseSoftmaxCrossEntropyWithLogits", - Input: []tf.Input{ - features, labels, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// Returns element-wise remainder of division. This emulates C semantics in that -// -// the result here is consistent with a truncating divide. E.g. -// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`. -// -// *NOTE*: `Mod` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Mod", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the next record (key, value pair) produced by a Reader. +// Returns up to `num_records` (key, value) pairs produced by a Reader. // // Will dequeue from the input queue if necessary (e.g. 
when the // Reader needs to start reading from a new file since it has finished // with the previous file). +// It may return less than `num_records` even before the last batch. // // Arguments: -// reader_handle: Handle to a Reader. -// queue_handle: Handle to a Queue, with string work items. +// reader_handle: Handle to a `Reader`. +// queue_handle: Handle to a `Queue`, with string work items. +// num_records: number of records to read from `Reader`. // -// Returns A scalar.A scalar. -func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) { +// Returns A 1-D tensor.A 1-D tensor. +func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "ReaderReadV2", + Type: "ReaderReadUpToV2", Input: []tf.Input{ - reader_handle, queue_handle, + reader_handle, queue_handle, num_records, }, } op := scope.AddOperation(opspec) return op.Output(0), op.Output(1) } -// QuantizedReluXAttr is an optional argument to QuantizedReluX. -type QuantizedReluXAttr func(optionalAttr) - -// QuantizedReluXOutType sets the optional out_type attribute to value. -// If not specified, defaults to DT_QUINT8 -func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)` -// -// Arguments: -// -// -// min_features: The float value that the lowest quantized value represents. -// max_features: The float value that the highest quantized value represents. -// -// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents. -func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) { +// Returns element-wise largest integer not greater than x. +func Floor(scope *Scope, x tf.Output) (y tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } opspec := tf.OpSpec{ - Type: "QuantizedReluX", + Type: "Floor", Input: []tf.Input{ - features, max_value, min_features, max_features, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Output a fact about factorials. -func Fact(scope *Scope) (fact tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Fact", - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// 2D fast Fourier transform. -// -// Computes the 2-dimensional discrete Fourier transform over the inner-most -// 2 dimensions of `input`. -// -// Arguments: -// input: A complex tensor. -// -// Returns A complex tensor of the same shape as `input`. The inner-most 2 -// dimensions of `input` are replaced with their 2D Fourier transform. 
-// -// @compatibility(numpy) -// Equivalent to np.fft.fft2 -// @end_compatibility -func FFT2D(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "FFT2D", - Input: []tf.Input{ - input, + x, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent. -type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr) +// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad. +type ResizeBilinearGradAttr func(optionalAttr) -// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value. +// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value. // -// value: If True, the subtraction will be protected by a lock; -// otherwise the behavior is undefined, but may exhibit less contention. +// value: If true, the centers of the 4 corner pixels of the input and grad tensors are +// aligned. Defaults to false. // If not specified, defaults to false -func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr { +func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr { return func(m optionalAttr) { - m["use_locking"] = value + m["align_corners"] = value } } -// Sparse update '*var' as FOBOS algorithm with fixed learning rate. -// -// That is for rows we have grad for, we update var as follows: -// prox_v = var - alpha * grad -// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} -// -// Arguments: -// var_: Should be from a Variable(). -// alpha: Scaling factor. Must be a scalar. -// l1: L1 regularization. Must be a scalar. -// l2: L2 regularization. Must be a scalar. -// grad: The gradient. -// indices: A vector of indices into the first dimension of var and accum. -// -// Returns the created operation. -func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceSparseApplyProximalGradientDescent", - Input: []tf.Input{ - var_, alpha, l1, l2, grad, indices, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Computes the maximum along segments of a tensor. -// -// Read -// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) -// for an explanation of segments. -// -// This operator is similar to the unsorted segment sum operator found -// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum). -// Instead of computing the sum over segments, it computes the maximum such that: -// -// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such -// that `segment_ids[j...] == i`. -// -// If the maximum is empty for a given segment ID `i`, it outputs the smallest -// possible value for the specific numeric type, -// `output[i] = numeric_limits<T>::lowest()`. -// -// If the given segment ID `i` is negative, then the corresponding value is -// dropped, and will not be included in the result. 
-// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt> -// </div> -// -// For example: -// -// ``` python -// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]]) -// tf.unsorted_segment_max(c, tf.constant([0, 1, 0]), num_segments=2) -// # ==> [[ 4, 3, 3, 4], -// # [5, 6, 7, 8]] -// ``` -// -// -// Arguments: -// -// segment_ids: A tensor whose shape is a prefix of `data.shape`. -// -// -// Returns Has same shape as data, except for the first `segment_ids.rank` -// dimensions, which are replaced with a single dimension which has size -// `num_segments`. -func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "UnsortedSegmentMax", - Input: []tf.Input{ - data, segment_ids, num_segments, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// An Op to sum inputs across replicated TPU instances. -// -// Each instance supplies its own input. -// -// For example, suppose there are 8 TPU instances: `[A, B, C, D, E, F, G, H]`. -// Passing group_assignment=`[[0,2,4,6],[1,3,5,7]]` sets `A, C, E, G` as group 0, -// and `B, D, F, H` as group 1. Thus we get the outputs: -// `[A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H]`. -// -// Arguments: -// input: The local input to the sum. -// group_assignment: An int32 tensor with shape -// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the -// replica ids in the ith subgroup. -// -// Returns The sum of all the distributed inputs. -func CrossReplicaSum(scope *Scope, input tf.Output, group_assignment tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "CrossReplicaSum", - Input: []tf.Input{ - input, group_assignment, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// QuantizedDepthwiseConv2DAttr is an optional argument to QuantizedDepthwiseConv2D. -type QuantizedDepthwiseConv2DAttr func(optionalAttr) - -// QuantizedDepthwiseConv2DOutType sets the optional out_type attribute to value. -// -// value: The type of the output. -// If not specified, defaults to DT_QINT32 -func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2DAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. -// -// value: List of dilation values. -// If not specified, defaults to <i:1 i:1 i:1 i:1 > -func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes quantized depthwise Conv2D. -// -// Arguments: -// input: The original input tensor. -// filter: The original filter tensor. -// min_input: The float value that the minimum quantized input value represents. -// max_input: The float value that the maximum quantized input value represents. -// min_filter: The float value that the minimum quantized filter value represents. -// max_filter: The float value that the maximum quantized filter value represents. -// strides: List of stride values. -// -// -// Returns The output tensor.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents. 
-func QuantizedDepthwiseConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QuantizedDepthwiseConv2D", - Input: []tf.Input{ - input, filter, min_input, max_input, min_filter, max_filter, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// EncodeProtoAttr is an optional argument to EncodeProto. -type EncodeProtoAttr func(optionalAttr) - -// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value. -// If not specified, defaults to "local://" -func EncodeProtoDescriptorSource(value string) EncodeProtoAttr { - return func(m optionalAttr) { - m["descriptor_source"] = value - } -} - -// The op serializes protobuf messages provided in the input tensors. -// -// The types of the tensors in `values` must match the schema for the -// fields specified in `field_names`. All the tensors in `values` must -// have a common shape prefix, *batch_shape*. -// -// The `sizes` tensor specifies repeat counts for each field. The repeat -// count (last dimension) of each tensor in `values` must be greater -// than or equal to the corresponding repeat count in `sizes`. -// -// A `message_type` name must be provided to give context for the field -// names. The actual message descriptor can be looked up either in the -// linked-in descriptor pool or a filename provided by the caller using -// the `descriptor_source` attribute. -// -// The `descriptor_source` attribute selects a source of protocol -// descriptors to consult when looking up `message_type`. This may be a -// filename containing a serialized `FileDescriptorSet` message, -// or the special value `local://`, in which case only descriptors linked -// into the code will be searched; the filename can be on any filesystem -// accessible to TensorFlow. -// -// You can build a `descriptor_source` file using the `--descriptor_set_out` -// and `--include_imports` options to the protocol compiler `protoc`. -// -// The `local://` database only covers descriptors linked into the -// code via C++ libraries, not Python imports. You can link in a proto descriptor -// by creating a cc_library target with alwayslink=1. -// -// There are a few special cases in the value mapping: -// -// Submessage and group fields must be pre-serialized as TensorFlow strings. -// -// TensorFlow lacks support for unsigned int64s, so they must be -// represented as `tf.int64` with the same twos-complement bit pattern -// (the obvious way). -// -// Unsigned int32 values can be represented exactly with `tf.int64`, or -// with sign wrapping if the input is of type `tf.int32`. -// -// Arguments: -// sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`. -// values: List of tensors containing values for the corresponding field. -// field_names: List of strings containing proto field names. -// message_type: Name of the proto message type to encode. -// -// Returns Tensor of serialized protos with shape `batch_shape`.
-func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "EncodeProto", - Input: []tf.Input{ - sizes, tf.OutputList(values), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the truth value of x OR y element-wise. -// -// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "LogicalOr", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Connects outputs of an N-way replicated computation to N outputs. -func TPUReplicatedOutput(scope *Scope, input tf.Output, num_replicas int64) (outputs []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_replicas": num_replicas} - opspec := tf.OpSpec{ - Type: "TPUReplicatedOutput", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil { - scope.UpdateErr("TPUReplicatedOutput", err) - return - } - return outputs -} - -// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2. -type Conv3DBackpropFilterV2Attr func(optionalAttr) - -// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value. -// -// value: The data format of the input and output data. With the -// default format "NDHWC", the data is stored in the order of: -// [batch, in_depth, in_height, in_width, in_channels]. -// Alternatively, the format could be "NCDHW", the data storage order is: -// [batch, in_channels, in_depth, in_height, in_width]. -// If not specified, defaults to "NDHWC" -func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value. -// -// value: 1-D tensor of length 5. The dilation factor for each dimension of -// `input`. If set to k > 1, there will be k-1 skipped cells between each -// filter element on that dimension. The dimension order is determined by the -// value of `data_format`, see above for details. Dilations in the batch and -// depth dimensions must be 1. -// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 > -func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes the gradients of 3-D convolution with respect to the filter. -// -// Arguments: -// input: Shape `[batch, depth, rows, cols, in_channels]`. -// filter_sizes: An integer vector representing the tensor shape of `filter`, -// where `filter` is a 5-D -// `[filter_depth, filter_height, filter_width, in_channels, out_channels]` -// tensor. -// out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols, -// out_channels]`. -// strides: 1-D tensor of length 5. The stride of the sliding window for each -// dimension of `input`. 
Must have `strides[0] = strides[4] = 1`. -// padding: The type of padding algorithm to use. -func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Conv3DBackpropFilterV2", - Input: []tf.Input{ - input, filter_sizes, out_backprop, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a dataset that passes a sliding window over `input_dataset`. -// -// Arguments: -// -// window_size: A scalar representing the number of elements in the -// sliding window. -// window_shift: A scalar representing the steps moving the sliding window -// forward in one iteration. It must be positive. -// window_stride: A scalar representing the stride of the input elements of the sliding window. -// It must be positive. -// -// -func ExperimentalSlidingWindowDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "ExperimentalSlidingWindowDataset", - Input: []tf.Input{ - input_dataset, window_size, window_shift, window_stride, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StringSplitV2Attr is an optional argument to StringSplitV2. -type StringSplitV2Attr func(optionalAttr) - -// StringSplitV2Maxsplit sets the optional maxsplit attribute to value. -// -// value: An `int`. If `maxsplit > 0`, limit of the split of the result. -// If not specified, defaults to -1 -func StringSplitV2Maxsplit(value int64) StringSplitV2Attr { - return func(m optionalAttr) { - m["maxsplit"] = value - } -} - -// Split elements of `source` based on `sep` into a `SparseTensor`. -// -// Let N be the size of source (typically N will be the batch size). Split each -// element of `source` based on `sep` and return a `SparseTensor` -// containing the split tokens. Empty tokens are ignored. -// -// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c', -// then the output will be -// ``` -// st.indices = [0, 0; -// 0, 1; -// 1, 0; -// 1, 1; -// 1, 2] -// st.shape = [2, 3] -// st.values = ['hello', 'world', 'a', 'b', 'c'] -// ``` -// -// If `sep` is given, consecutive delimiters are not grouped together and are -// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and -// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty -// string, consecutive whitespace is regarded as a single separator, and the -// result will contain no empty strings at the start or end if the string has -// leading or trailing whitespace. -// -// Note that the above mentioned behavior matches Python's str.split. -// -// Arguments: -// input: `1-D` string `Tensor`, the strings to split. -// sep: `0-D` string `Tensor`, the delimiter character.
-func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StringSplitV2", - Input: []tf.Input{ - input, sep, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingRMSPropParametersGradAccumDebug. -type LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr) - -// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Load RMSProp embedding parameters with debug support. -// -// An op that loads optimization parameters into HBM for embedding. Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the RMSProp optimization algorithm. -// ms: Value of ms used in the RMSProp optimization algorithm. -// mom: Value of mom used in the RMSProp optimization algorithm. -// gradient_accumulators: Value of gradient_accumulators used in the RMSProp optimization algorithm. -// -// -// -// Returns the created operation. -func LoadTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug", - Input: []tf.Input{ - parameters, ms, mom, gradient_accumulators, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// LRNGradAttr is an optional argument to LRNGrad. -type LRNGradAttr func(optionalAttr) - -// LRNGradDepthRadius sets the optional depth_radius attribute to value. -// -// value: A depth radius. -// If not specified, defaults to 5 -func LRNGradDepthRadius(value int64) LRNGradAttr { - return func(m optionalAttr) { - m["depth_radius"] = value - } -} - -// LRNGradBias sets the optional bias attribute to value. -// -// value: An offset (usually > 0 to avoid dividing by 0). -// If not specified, defaults to 1 -func LRNGradBias(value float32) LRNGradAttr { - return func(m optionalAttr) { - m["bias"] = value - } -} - -// LRNGradAlpha sets the optional alpha attribute to value. -// -// value: A scale factor, usually positive. 
-// If not specified, defaults to 1 -func LRNGradAlpha(value float32) LRNGradAttr { - return func(m optionalAttr) { - m["alpha"] = value - } -} - -// LRNGradBeta sets the optional beta attribute to value. -// -// value: An exponent. -// If not specified, defaults to 0.5 -func LRNGradBeta(value float32) LRNGradAttr { - return func(m optionalAttr) { - m["beta"] = value - } -} - -// Gradients for Local Response Normalization. -// -// Arguments: -// input_grads: 4-D with shape `[batch, height, width, channels]`. -// input_image: 4-D with shape `[batch, height, width, channels]`. -// output_image: 4-D with shape `[batch, height, width, channels]`. -// -// Returns The gradients for LRN. -func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LRNGrad", - Input: []tf.Input{ - input_grads, input_image, output_image, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// GatherAttr is an optional argument to Gather. -type GatherAttr func(optionalAttr) - -// GatherValidateIndices sets the optional validate_indices attribute to value. -// If not specified, defaults to true -func GatherValidateIndices(value bool) GatherAttr { - return func(m optionalAttr) { - m["validate_indices"] = value - } -} - -// Gather slices from `params` according to `indices`. -// -// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D). -// Produces an output tensor with shape `indices.shape + params.shape[1:]` where: -// -// ```python -// # Scalar indices -// output[:, ..., :] = params[indices, :, ... :] -// -// # Vector indices -// output[i, :, ..., :] = params[indices[i], :, ... :] -// -// # Higher rank indices -// output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :] -// ``` -// -// If `indices` is a permutation and `len(indices) == params.shape[0]` then -// this operation will permute `params` accordingly. -// -// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in -// `indices` are always validated to be within range. If assigned to GPU, -// out-of-bound indices result in safe but unspecified behavior, which may include -// raising an error. -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt> -// </div> -func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Gather", - Input: []tf.Input{ - params, indices, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Concatenates a list of `SparseTensor` along the specified dimension. -// -// Concatenation is with respect to the dense versions of these sparse tensors. -// It is assumed that each input is a `SparseTensor` whose elements are ordered -// along increasing dimension number. -// -// All inputs' shapes must match, except for the concat dimension. The -// `indices`, `values`, and `shapes` lists must have the same length. -// -// The output shape is identical to the inputs', except along the concat -// dimension, where it is the sum of the inputs' sizes along that dimension. 
-// -// The output elements will be resorted to preserve the sort order along -// increasing dimension number. -// -// This op runs in `O(M log M)` time, where `M` is the total number of non-empty -// values across all inputs. This is due to the need for an internal sort in -// order to concatenate efficiently across an arbitrary dimension. -// -// For example, if `concat_dim = 1` and the inputs are -// -// sp_inputs[0]: shape = [2, 3] -// [0, 2]: "a" -// [1, 0]: "b" -// [1, 1]: "c" -// -// sp_inputs[1]: shape = [2, 4] -// [0, 1]: "d" -// [0, 2]: "e" -// -// then the output will be -// -// shape = [2, 7] -// [0, 2]: "a" -// [0, 4]: "d" -// [0, 5]: "e" -// [1, 0]: "b" -// [1, 1]: "c" -// -// Graphically this is equivalent to doing -// -// [ a] concat [ d e ] = [ a d e ] -// [b c ] [ ] [b c ] -// -// Arguments: -// indices: 2-D. Indices of each input `SparseTensor`. -// values: 1-D. Non-empty values of each `SparseTensor`. -// shapes: 1-D. Shapes of each `SparseTensor`. -// concat_dim: Dimension to concatenate along. Must be in range [-rank, rank), -// where rank is the number of dimensions in each input `SparseTensor`. -// -// Returns 2-D. Indices of the concatenated `SparseTensor`.1-D. Non-empty values of the concatenated `SparseTensor`.1-D. Shape of the concatenated `SparseTensor`. -func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"concat_dim": concat_dim} - opspec := tf.OpSpec{ - Type: "SparseConcat", - Input: []tf.Input{ - tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Adjust the hue of one or more images. -// -// `images` is a tensor of at least 3 dimensions. The last dimension is -// interpreted as channels, and must be three. -// -// The input image is considered in the RGB colorspace. Conceptually, the RGB -// colors are first mapped into HSV. A delta is then applied to all the hue values, -// which are then remapped back to RGB colorspace. -// -// Arguments: -// images: Images to adjust. At least 3-D. -// delta: A float delta to add to the hue. -// -// Returns The hue-adjusted image or images. -func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "AdjustHue", - Input: []tf.Input{ - images, delta, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// InfeedEnqueueAttr is an optional argument to InfeedEnqueue. -type InfeedEnqueueAttr func(optionalAttr) - -// InfeedEnqueueShape sets the optional shape attribute to value. -// -// value: The shape of the tensor. -// If not specified, defaults to <> -func InfeedEnqueueShape(value tf.Shape) InfeedEnqueueAttr { - return func(m optionalAttr) { - m["shape"] = value - } -} - -// InfeedEnqueueLayout sets the optional layout attribute to value. -// -// value: A vector holding the requested layout in minor-to-major sequence. -// If a layout attribute is passed, but its values are all -1, the layout will -// be computed by the infeed operation. -// If not specified, defaults to <> -func InfeedEnqueueLayout(value []int64) InfeedEnqueueAttr { - return func(m optionalAttr) { - m["layout"] = value - } -} - -// InfeedEnqueueDeviceOrdinal sets the optional device_ordinal attribute to value.
-// -// value: The TPU device to use. This should be -1 when the Op -// is running on a TPU device, and >= 0 when the Op is running on the CPU -// device. -// If not specified, defaults to -1 -func InfeedEnqueueDeviceOrdinal(value int64) InfeedEnqueueAttr { - return func(m optionalAttr) { - m["device_ordinal"] = value - } -} - -// An op which feeds a single Tensor value into the computation. -// -// Arguments: -// input: A tensor that will be provided using the infeed mechanism. -// -// Returns the created operation. -func InfeedEnqueue(scope *Scope, input tf.Output, optional ...InfeedEnqueueAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "InfeedEnqueue", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Returns a list which has the passed-in `Tensor` as its last element and the other elements of the given list in `input_handle`. -// -// tensor: The tensor to put on the list. -// input_handle: The old list. -// output_handle: A list with the elements of the old list followed by tensor. -// element_dtype: the type of elements in the list. -// element_shape: a shape compatible with that of elements in the list. -func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorListPushBack", - Input: []tf.Input{ - input_handle, tensor, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MaxPoolGradGradWithArgmaxAttr is an optional argument to MaxPoolGradGradWithArgmax. -type MaxPoolGradGradWithArgmaxAttr func(optionalAttr) - -// MaxPoolGradGradWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value. -// -// value: Whether to include batch dimension in flattened index of `argmax`. +// ResizeBilinearGradHalfPixelCenters sets the optional half_pixel_centers attribute to value. // If not specified, defaults to false -func MaxPoolGradGradWithArgmaxIncludeBatchInIndex(value bool) MaxPoolGradGradWithArgmaxAttr { +func ResizeBilinearGradHalfPixelCenters(value bool) ResizeBilinearGradAttr { return func(m optionalAttr) { - m["include_batch_in_index"] = value + m["half_pixel_centers"] = value } } -// Computes second-order gradients of the maxpooling function. +// Computes the gradient of bilinear interpolation. // // Arguments: -// input: The original input. -// grad: 4-D with shape `[batch, height, width, channels]`. Gradients w.r.t. the -// input of `max_pool`. -// argmax: The indices of the maximum values chosen for each output of `max_pool`. -// ksize: The size of the window for each dimension of the input tensor. -// strides: The stride of the sliding window for each dimension of the -// input tensor. -// padding: The type of padding algorithm to use. +// grads: 4-D with shape `[batch, height, width, channels]`. +// original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`, +// The image tensor that was resized. // -// Returns Gradients of gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradWithArgmaxAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MaxPoolGradGradWithArgmax", - Input: []tf.Input{ - input, grad, argmax, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent. -type ResourceApplyGradientDescentAttr func(optionalAttr) - -// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, the subtraction will be protected by a lock; -// otherwise the behavior is undefined, but may exhibit less contention. -// If not specified, defaults to false -func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update '*var' by subtracting 'alpha' * 'delta' from it. -// -// Arguments: -// var_: Should be from a Variable(). -// alpha: Scaling factor. Must be a scalar. -// delta: The change. -// -// Returns the created operation. -func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) { +// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. +// Gradients with respect to the input image. Input image must have been +// float or double. +func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -21335,66 +38084,9 @@ func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, a(attrs) } opspec := tf.OpSpec{ - Type: "ResourceApplyGradientDescent", + Type: "ResizeBilinearGrad", Input: []tf.Input{ - var_, alpha, delta, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D. -type FusedResizeAndPadConv2DAttr func(optionalAttr) - -// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value. -// -// value: If true, the centers of the 4 corner pixels of the input and output tensors are -// aligned, preserving the values at the corner pixels. Defaults to false. -// If not specified, defaults to false -func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr { - return func(m optionalAttr) { - m["resize_align_corners"] = value - } -} - -// Performs a resize and padding as a preprocess during a convolution. -// -// It's often possible to do spatial transformations more efficiently as part of -// the packing stage of a convolution, so this op allows for an optimized -// implementation where these stages are fused together. This prevents the need to -// write out the intermediate results as whole tensors, reducing memory pressure, -// and we can get some latency gains by merging the transformation calculations. -// The data_format attribute for Conv2D isn't supported by this op, and defaults to -// 'NHWC' order. 
-// Internally this op uses a single per-graph scratch buffer, which means that it -// will block if multiple versions are being run in parallel. This is because this -// operator is primarily an optimization to minimize memory usage. -// -// Arguments: -// input: 4-D with shape `[batch, in_height, in_width, in_channels]`. -// size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The -// new size for the images. -// paddings: A two-column matrix specifying the padding sizes. The number of -// rows must be the same as the rank of `input`. -// filter: 4-D with shape -// `[filter_height, filter_width, in_channels, out_channels]`. -// -// strides: 1-D of length 4. The stride of the sliding window for each dimension -// of `input`. Must be in the same order as the dimension specified with format. -// padding: The type of padding algorithm to use. -func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "FusedResizeAndPadConv2D", - Input: []tf.Input{ - input, size, paddings, filter, + grads, original_image, }, Attrs: attrs, } @@ -21402,556 +38094,6 @@ func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, padd return op.Output(0) } -// Computes the derivative of a Gamma random sample w.r.t. `alpha`. -func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "RandomGammaGrad", - Input: []tf.Input{ - alpha, sample, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Debugging/model interpretability outputs for each example. -// -// It traverses all the trees and computes debug metrics for individual examples, -// such as getting split feature ids and logits after each split along the decision -// path used to compute directional feature contributions. -// -// Arguments: -// -// bucketized_features: A list of rank 1 Tensors containing bucket id for each -// feature. -// logits_dimension: scalar, dimension of the logits, to be used for constructing the protos in -// examples_debug_outputs_serialized. -// -// Returns Output rank 1 Tensor containing a proto serialized as a string for each example. -func BoostedTreesExampleDebugOutputs(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (examples_debug_outputs_serialized tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"logits_dimension": logits_dimension} - opspec := tf.OpSpec{ - Type: "BoostedTreesExampleDebugOutputs", - Input: []tf.Input{ - tree_ensemble_handle, tf.OutputList(bucketized_features), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RetrieveTPUEmbeddingADAMParametersAttr is an optional argument to RetrieveTPUEmbeddingADAMParameters. -type RetrieveTPUEmbeddingADAMParametersAttr func(optionalAttr) - -// RetrieveTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value. 
-// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingADAMParametersTableId(value int64) RetrieveTPUEmbeddingADAMParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingADAMParametersTableName(value string) RetrieveTPUEmbeddingADAMParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve ADAM embedding parameters. -// -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. -// -// Returns Parameter parameters updated by the ADAM optimization algorithm.Parameter momenta updated by the ADAM optimization algorithm.Parameter velocities updated by the ADAM optimization algorithm. -func RetrieveTPUEmbeddingADAMParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingADAMParameters", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Computes the gradient of `igamma(a, x)` wrt `a`. -func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "IgammaGradA", - Input: []tf.Input{ - a, x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// PrelinearizeTupleAttr is an optional argument to PrelinearizeTuple. -type PrelinearizeTupleAttr func(optionalAttr) - -// PrelinearizeTupleLayouts sets the optional layouts attribute to value. -// -// value: A vector holding the requested layout in minor-to-major sequence for all the -// tuple shapes in the order the shapes appear in the "shapes" input. The layout -// elements for a sub-shape can be set to -1 in which case the corresponding layout -// will be computed by the infeed operation. -// If not specified, defaults to <> -func PrelinearizeTupleLayouts(value []int64) PrelinearizeTupleAttr { - return func(m optionalAttr) { - m["layouts"] = value - } -} - -// An op which linearizes multiple Tensor values to an opaque variant tensor. -// -// Arguments: -// inputs: A list of tensors that will be provided using the infeed mechanism. -// shapes: The shapes of each tensor in `inputs`. -func PrelinearizeTuple(scope *Scope, inputs []tf.Output, shapes []tf.Shape, optional ...PrelinearizeTupleAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"shapes": shapes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "PrelinearizeTuple", - Input: []tf.Input{ - tf.OutputList(inputs), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp. -type ResourceSparseApplyRMSPropAttr func(optionalAttr) - -// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value. 
-// -// value: If `True`, updating of the var, ms, and mom tensors is protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update '*var' according to the RMSProp algorithm. -// -// Note that in dense implementation of this algorithm, ms and mom will -// update even if the grad is zero, but in this sparse implementation, ms -// and mom will not update in iterations during which the grad is zero. -// -// mean_square = decay * mean_square + (1-decay) * gradient ** 2 -// Delta = learning_rate * gradient / sqrt(mean_square + epsilon) -// -// ms <- rho * ms_{t-1} + (1-rho) * grad * grad -// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -// var <- var - mom -// -// Arguments: -// var_: Should be from a Variable(). -// ms: Should be from a Variable(). -// mom: Should be from a Variable(). -// lr: Scaling factor. Must be a scalar. -// rho: Decay rate. Must be a scalar. -// -// epsilon: Ridge term. Must be a scalar. -// grad: The gradient. -// indices: A vector of indices into the first dimension of var, ms and mom. -// -// Returns the created operation. -func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceSparseApplyRMSProp", - Input: []tf.Input{ - var_, ms, mom, lr, rho, momentum, epsilon, grad, indices, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// MapPeekAttr is an optional argument to MapPeek. -type MapPeekAttr func(optionalAttr) - -// MapPeekCapacity sets the optional capacity attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func MapPeekCapacity(value int64) MapPeekAttr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// MapPeekMemoryLimit sets the optional memory_limit attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func MapPeekMemoryLimit(value int64) MapPeekAttr { - return func(m optionalAttr) { - m["memory_limit"] = value - } -} - -// MapPeekContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func MapPeekContainer(value string) MapPeekAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// MapPeekSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func MapPeekSharedName(value string) MapPeekAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Op peeks at the values at the specified key. If the -// -// underlying container does not contain this key -// this op will block until it does. 
-func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtypes": dtypes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MapPeek", - Input: []tf.Input{ - key, indices, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if values, idx, err = makeOutputList(op, idx, "values"); err != nil { - scope.UpdateErr("MapPeek", err) - return - } - return values -} - -// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta. -type ResourceSparseApplyAdadeltaAttr func(optionalAttr) - -// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value. -// -// value: If True, updating of the var and accum tensors will be protected by -// a lock; otherwise the behavior is undefined, but may exhibit less contention. -// If not specified, defaults to false -func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// var: Should be from a Variable(). -// -// Arguments: -// -// accum: Should be from a Variable(). -// accum_update: Should be from a Variable(). -// lr: Learning rate. Must be a scalar. -// rho: Decay factor. Must be a scalar. -// epsilon: Constant factor. Must be a scalar. -// grad: The gradient. -// indices: A vector of indices into the first dimension of var and accum. -// -// Returns the created operation. -func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceSparseApplyAdadelta", - Input: []tf.Input{ - var_, accum, accum_update, lr, rho, epsilon, grad, indices, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Real-valued fast Fourier transform. -// -// Computes the 1-dimensional discrete Fourier transform of a real-valued signal -// over the inner-most dimension of `input`. -// -// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the -// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term, -// followed by the `fft_length / 2` positive-frequency terms. -// -// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the -// corresponding dimension of `input`, the dimension is cropped. If it is larger, -// the dimension is padded with zeros. -// -// Arguments: -// input: A float32 tensor. -// fft_length: An int32 tensor of shape [1]. The FFT length. -// -// Returns A complex64 tensor of the same rank as `input`. The inner-most -// dimension of `input` is replaced with the `fft_length / 2 + 1` unique -// frequency components of its 1D Fourier transform. -// -// @compatibility(numpy) -// Equivalent to np.fft.rfft -// @end_compatibility -func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "RFFT", - Input: []tf.Input{ - input, fft_length, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr) - -// LRNDepthRadius sets the optional depth_radius attribute to value. -// -// value: 0-D. Half-width of the 1-D normalization window. -// If not specified, defaults to 5 -func LRNDepthRadius(value int64) LRNAttr { - return func(m optionalAttr) { - m["depth_radius"] = value - } -} - -// LRNBias sets the optional bias attribute to value. -// -// value: An offset (usually positive to avoid dividing by 0). -// If not specified, defaults to 1 -func LRNBias(value float32) LRNAttr { - return func(m optionalAttr) { - m["bias"] = value - } -} - -// LRNAlpha sets the optional alpha attribute to value. -// -// value: A scale factor, usually positive. -// If not specified, defaults to 1 -func LRNAlpha(value float32) LRNAttr { - return func(m optionalAttr) { - m["alpha"] = value - } -} - -// LRNBeta sets the optional beta attribute to value. -// -// value: An exponent. -// If not specified, defaults to 0.5 -func LRNBeta(value float32) LRNAttr { - return func(m optionalAttr) { - m["beta"] = value - } -} - -// Local Response Normalization. -// -// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last -// dimension), and each vector is normalized independently. Within a given vector, -// each component is divided by the weighted, squared sum of inputs within -// `depth_radius`. In detail, -// -// sqr_sum[a, b, c, d] = -// sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2) -// output = input / (bias + alpha * sqr_sum) ** beta -// -// For details, see [Krizhevsky et al., ImageNet classification with deep -// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks). -// -// Arguments: -// input: 4-D. -func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LRN", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp. -type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr) - -// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var, mg, ms, and mom tensors is -// protected by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update '*var' according to the centered RMSProp algorithm. -// -// The centered RMSProp algorithm uses an estimate of the centered second moment -// (i.e., the variance) for normalization, as opposed to regular RMSProp, which -// uses the (uncentered) second moment. This often helps with training, but is -// slightly more expensive in terms of computation and memory. -// -// Note that in dense implementation of this algorithm, mg, ms, and mom will -// update even if the grad is zero, but in this sparse implementation, mg, ms, -// and mom will not update in iterations during which the grad is zero. 
-// -// mean_square = decay * mean_square + (1-decay) * gradient ** 2 -// mean_grad = decay * mean_grad + (1-decay) * gradient -// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) -// -// ms <- rho * ms_{t-1} + (1-rho) * grad * grad -// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -// var <- var - mom -// -// Arguments: -// var_: Should be from a Variable(). -// mg: Should be from a Variable(). -// ms: Should be from a Variable(). -// mom: Should be from a Variable(). -// lr: Scaling factor. Must be a scalar. -// rho: Decay rate. Must be a scalar. -// -// epsilon: Ridge term. Must be a scalar. -// grad: The gradient. -// indices: A vector of indices into the first dimension of var, ms and mom. -// -// Returns the created operation. -func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceSparseApplyCenteredRMSProp", - Input: []tf.Input{ - var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta. -type ResourceApplyAdadeltaAttr func(optionalAttr) - -// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value. -// -// value: If True, updating of the var, accum and update_accum tensors will be protected by -// a lock; otherwise the behavior is undefined, but may exhibit less contention. -// If not specified, defaults to false -func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update '*var' according to the adadelta scheme. -// -// accum = rho() * accum + (1 - rho()) * grad.square(); -// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; -// update_accum = rho() * update_accum + (1 - rho()) * update.square(); -// var -= update; -// -// Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// accum_update: Should be from a Variable(). -// lr: Scaling factor. Must be a scalar. -// rho: Decay factor. Must be a scalar. -// epsilon: Constant factor. Must be a scalar. -// grad: The gradient. -// -// Returns the created operation. -func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceApplyAdadelta", - Input: []tf.Input{ - var_, accum, accum_update, lr, rho, epsilon, grad, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - // Returns the number of records this Reader has produced. // // This is the same as the number of ReaderRead executions that have @@ -21973,2173 +38115,58 @@ func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_ return op.Output(0) } -// Creates a dataset that emits `components` as a tuple of tensors once. 
-func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "TensorDataset", - Input: []tf.Input{ - tf.OutputList(components), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Inverse 2D real-valued fast Fourier transform. +// Reshapes a quantized tensor as per the Reshape op. // -// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued -// signal over the inner-most 2 dimensions of `input`. -// -// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`: -// The inner-most dimension contains the `fft_length / 2 + 1` unique components of -// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed -// from the size of the inner-most 2 dimensions of `input`. If the FFT length used -// to compute `input` is odd, it should be provided since it cannot be inferred -// properly. -// -// Along each axis `IRFFT2D` is computed on, if `fft_length` (or -// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the -// corresponding dimension of `input`, the dimension is cropped. If it is larger, -// the dimension is padded with zeros. -// -// Arguments: -// input: A complex64 tensor. -// fft_length: An int32 tensor of shape [2]. The FFT length for each dimension. -// -// Returns A float32 tensor of the same rank as `input`. The inner-most 2 -// dimensions of `input` are replaced with the `fft_length` samples of their -// inverse 2D Fourier transform. -// -// @compatibility(numpy) -// Equivalent to np.fft.irfft2 -// @end_compatibility -func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "IRFFT2D", - Input: []tf.Input{ - input, fft_length, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StringJoinAttr is an optional argument to StringJoin. -type StringJoinAttr func(optionalAttr) - -// StringJoinSeparator sets the optional separator attribute to value. -// -// value: string, an optional join separator. -// If not specified, defaults to "" -func StringJoinSeparator(value string) StringJoinAttr { - return func(m optionalAttr) { - m["separator"] = value - } -} - -// Joins the strings in the given list of string tensors into one tensor; -// -// with the given separator (default is an empty separator). -// -// Arguments: -// inputs: A list of string tensors. The tensors must all have the same shape, -// or be scalars. Scalars may be mixed in; these will be broadcast to the shape -// of non-scalar inputs. -func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StringJoin", - Input: []tf.Input{ - tf.OutputList(inputs), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MultinomialAttr is an optional argument to Multinomial. -type MultinomialAttr func(optionalAttr) - -// MultinomialSeed sets the optional seed attribute to value. -// -// value: If either seed or seed2 is set to be non-zero, the internal random number -// generator is seeded by the given seed. Otherwise, a random seed is used. 
-// If not specified, defaults to 0 -func MultinomialSeed(value int64) MultinomialAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// MultinomialSeed2 sets the optional seed2 attribute to value. -// -// value: A second seed to avoid seed collision. -// If not specified, defaults to 0 -func MultinomialSeed2(value int64) MultinomialAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// MultinomialOutputDtype sets the optional output_dtype attribute to value. -// If not specified, defaults to DT_INT64 -func MultinomialOutputDtype(value tf.DataType) MultinomialAttr { - return func(m optionalAttr) { - m["output_dtype"] = value - } -} - -// Draws samples from a multinomial distribution. -// -// Arguments: -// logits: 2-D Tensor with shape `[batch_size, num_classes]`. Each slice `[i, :]` -// represents the unnormalized log probabilities for all classes. -// num_samples: 0-D. Number of independent samples to draw for each row slice. -// -// Returns 2-D Tensor with shape `[batch_size, num_samples]`. Each slice `[i, :]` -// contains the drawn class labels with range `[0, num_classes)`. -func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Multinomial", - Input: []tf.Input{ - logits, num_samples, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// TryRpcAttr is an optional argument to TryRpc. -type TryRpcAttr func(optionalAttr) - -// TryRpcProtocol sets the optional protocol attribute to value. -// -// value: RPC protocol to use. Empty string means use the default protocol. -// Options include 'grpc'. -// If not specified, defaults to "" -func TryRpcProtocol(value string) TryRpcAttr { - return func(m optionalAttr) { - m["protocol"] = value - } -} - -// TryRpcFailFast sets the optional fail_fast attribute to value. -// -// value: `boolean`. If `true` (default), then failures to connect -// (i.e., the server does not immediately respond) cause an RPC failure. -// If not specified, defaults to true -func TryRpcFailFast(value bool) TryRpcAttr { - return func(m optionalAttr) { - m["fail_fast"] = value - } -} - -// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value. -// -// value: `int`. If `0` (default), then the kernel will run the RPC -// request and only time out if the RPC deadline passes or the session times out. -// If this value is greater than `0`, then the op will raise an exception if -// the RPC takes longer than `timeout_in_ms`. -// If not specified, defaults to 0 -func TryRpcTimeoutInMs(value int64) TryRpcAttr { - return func(m optionalAttr) { - m["timeout_in_ms"] = value - } -} - -// Perform batches of RPC requests. -// -// This op asynchronously performs either a single RPC request, or a batch -// of requests. RPC requests are defined by three main parameters: -// -// - `address` (the host+port or BNS address of the request) -// - `method` (the method name for the request) -// - `request` (the serialized proto string, or vector of strings, -// of the RPC request argument). 
-// -// For example, if you have an RPC service running on port localhost:2345, -// and its interface is configured with the following proto declaration: -// -// ``` -// service MyService { -// rpc MyMethod(MyRequestProto) returns (MyResponseProto) { -// } -// }; -// ``` -// -// then call this op with arguments: -// -// ``` -// address = "localhost:2345" -// method = "MyService/MyMethod" -// ``` -// -// The `request` tensor is a string tensor representing serialized `MyRequestProto` -// strings; and the output string tensor `response` will have the same shape -// and contain (upon successful completion) corresponding serialized -// `MyResponseProto` strings. -// -// For example, to send a single, empty, `MyRequestProto`, call -// this op with `request = ""`. To send 5 **parallel** empty requests, -// call this op with `request = ["", "", "", "", ""]`. -// -// More generally, one can create a batch of `MyRequestProto` serialized protos -// from regular batched tensors using the `encode_proto` op, and convert -// the response `MyResponseProto` serialized protos to batched tensors -// using the `decode_proto` op. -// -// **NOTE** Working with serialized proto strings is faster than instantiating -// actual proto objects in memory, so no performance degradation is expected -// compared to writing custom kernels for this workflow. -// -// Unlike the standard `Rpc` op, if the connection fails or the remote worker -// returns an error status, this op does **not** reraise the exception. -// Instead, the `status_code` and `status_message` entry for the corresponding RPC -// call is set with the error returned from the RPC call. The `response` tensor -// will contain valid response values for those minibatch entries whose RPCs did -// not fail; the rest of the entries will have empty strings. -// -// Arguments: -// address: `0-D` or `1-D`. The address (i.e. host_name:port) of the RPC server. -// If this tensor has more than 1 element, then multiple parallel rpc requests -// are sent. This argument broadcasts with `method` and `request`. -// method: `0-D` or `1-D`. The method address on the RPC server. -// If this tensor has more than 1 element, then multiple parallel rpc requests -// are sent. This argument broadcasts with `address` and `request`. -// request: `0-D` or `1-D`. Serialized proto strings: the rpc request argument. -// If this tensor has more than 1 element, then multiple parallel rpc requests -// are sent. This argument broadcasts with `address` and `method`. -// -// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`. Values correspond to tensorflow Status enum codes.Same shape as `request`. Values correspond to Status messages -// returned from the RPC calls. -func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TryRpc", - Input: []tf.Input{ - address, method, request, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Returns the max of x and y (i.e. x > y ? x : y) element-wise. -// -// *NOTE*: `Maximum` supports broadcasting. 
More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Maximum", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the absolute value of a tensor. -// -// Given a tensor `x`, this operation returns a tensor containing the absolute -// value of each element in `x`. For example, if x is an input element and y is -// an output element, this operation computes \\(y = |x|\\). -func Abs(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Abs", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// EnqueueTPUEmbeddingIntegerBatchAttr is an optional argument to EnqueueTPUEmbeddingIntegerBatch. -type EnqueueTPUEmbeddingIntegerBatchAttr func(optionalAttr) - -// EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal sets the optional device_ordinal attribute to value. -// -// value: The TPU device to use. Should be >= 0 and less than the number -// of TPU cores in the task on which the node is placed. -// If not specified, defaults to -1 -func EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingIntegerBatchAttr { - return func(m optionalAttr) { - m["device_ordinal"] = value - } -} - -// An op that enqueues a list of input batch tensors to TPUEmbedding. -// -// Arguments: -// batch: A list of 1D tensors, one for each embedding table, containing the -// indices into the tables. -// mode_override: A string input that overrides the mode specified in the -// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference', -// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set -// in TPUEmbeddingConfiguration is used, otherwise mode_override is used. -// -// Returns the created operation. -func EnqueueTPUEmbeddingIntegerBatch(scope *Scope, batch []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingIntegerBatchAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "EnqueueTPUEmbeddingIntegerBatch", - Input: []tf.Input{ - tf.OutputList(batch), mode_override, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Concatenates tensors along one dimension. -// -// Arguments: -// concat_dim: 0-D. The dimension along which to concatenate. Must be in the -// range [0, rank(values)). -// values: The `N` Tensors to concatenate. Their ranks and types must match, -// and their sizes must match in all dimensions except `concat_dim`. -// -// Returns A `Tensor` with the concatenation of values stacked along the -// `concat_dim` dimension. This tensor's shape matches that of `values` except -// in `concat_dim` where it has the sum of the sizes. -func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Concat", - Input: []tf.Input{ - concat_dim, tf.OutputList(values), - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingADAMParametersGradAccumDebug. 
-type RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr func(optionalAttr) - -// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve ADAM embedding parameters with debug support. -// -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. -// -// Returns Parameter parameters updated by the ADAM optimization algorithm.Parameter momenta updated by the ADAM optimization algorithm.Parameter velocities updated by the ADAM optimization algorithm.Parameter gradient_accumulators updated by the ADAM optimization algorithm. -func RetrieveTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3) -} - -// Inverse 3D real-valued fast Fourier transform. -// -// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued -// signal over the inner-most 3 dimensions of `input`. -// -// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`: -// The inner-most dimension contains the `fft_length / 2 + 1` unique components of -// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed -// from the size of the inner-most 3 dimensions of `input`. If the FFT length used -// to compute `input` is odd, it should be provided since it cannot be inferred -// properly. -// -// Along each axis `IRFFT3D` is computed on, if `fft_length` (or -// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the -// corresponding dimension of `input`, the dimension is cropped. If it is larger, -// the dimension is padded with zeros. -// -// Arguments: -// input: A complex64 tensor. -// fft_length: An int32 tensor of shape [3]. The FFT length for each dimension. -// -// Returns A float32 tensor of the same rank as `input`. The inner-most 3 -// dimensions of `input` are replaced with the `fft_length` samples of their -// inverse 3D real Fourier transform. -// -// @compatibility(numpy) -// Equivalent to np.irfftn with 3 dimensions. 
-// @end_compatibility -func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "IRFFT3D", - Input: []tf.Input{ - input, fft_length, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// SerializeSparseAttr is an optional argument to SerializeSparse. -type SerializeSparseAttr func(optionalAttr) - -// SerializeSparseOutType sets the optional out_type attribute to value. -// -// value: The `dtype` to use for serialization; the supported types are `string` -// (default) and `variant`. -// If not specified, defaults to DT_STRING -func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// Serialize a `SparseTensor` into a `[3]` `Tensor` object. -// -// Arguments: -// sparse_indices: 2-D. The `indices` of the `SparseTensor`. -// sparse_values: 1-D. The `values` of the `SparseTensor`. -// sparse_shape: 1-D. The `shape` of the `SparseTensor`. -func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SerializeSparse", - Input: []tf.Input{ - sparse_indices, sparse_values, sparse_shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl. -type ResourceApplyFtrlAttr func(optionalAttr) - -// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var and accum tensors will be protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update '*var' according to the Ftrl-proximal scheme. -// -// accum_new = accum + grad * grad -// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -// accum = accum_new -// -// Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// linear: Should be from a Variable(). -// grad: The gradient. -// lr: Scaling factor. Must be a scalar. -// l1: L1 regulariation. Must be a scalar. -// l2: L2 regulariation. Must be a scalar. -// lr_power: Scaling factor. Must be a scalar. -// -// Returns the created operation. -func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceApplyFtrl", - Input: []tf.Input{ - var_, accum, linear, grad, lr, l1, l2, lr_power, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Reshapes a SparseTensor to represent values in a new dense shape. -// -// This operation has the same semantics as reshape on the represented dense -// tensor. 
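// Not part of the generated file: a hedged sketch of the SerializeSparse
// wrapper above, serializing a tiny 3x4 SparseTensor with two non-empty
// values into a length-3 string tensor (the default DT_STRING out_type).
// Import paths assume the standard TensorFlow Go bindings.
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	indices := op.Const(s, [][]int64{{0, 0}, {1, 2}}) // 2-D indices of the non-empty values
	values := op.Const(s, []float32{1.5, -2.0})       // 1-D values
	shape := op.Const(s, []int64{3, 4})               // dense shape of the SparseTensor
	serialized := op.SerializeSparse(s, indices, values, shape)
	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()
	out, err := sess.Run(nil, []tf.Output{serialized}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Shape()) // [3]: serialized indices, values, and shape
}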
The `input_indices` are recomputed based on the requested `new_shape`. -// -// If one component of `new_shape` is the special value -1, the size of that -// dimension is computed so that the total dense size remains constant. At -// most one component of `new_shape` can be -1. The number of dense elements -// implied by `new_shape` must be the same as the number of dense elements -// originally implied by `input_shape`. -// -// Reshaping does not affect the order of values in the SparseTensor. -// -// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape` -// has length `R_out`, then `input_indices` has shape `[N, R_in]`, -// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and -// `output_shape` has length `R_out`. -// -// Arguments: -// input_indices: 2-D. `N x R_in` matrix with the indices of non-empty values in a -// SparseTensor. -// input_shape: 1-D. `R_in` vector with the input SparseTensor's dense shape. -// new_shape: 1-D. `R_out` vector with the requested new dense shape. -// -// Returns 2-D. `N x R_out` matrix with the updated indices of non-empty -// values in the output SparseTensor.1-D. `R_out` vector with the full dense shape of the output -// SparseTensor. This is the same as `new_shape` but with any -1 dimensions -// filled in. -func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseReshape", - Input: []tf.Input{ - input_indices, input_shape, new_shape, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// RealAttr is an optional argument to Real. -type RealAttr func(optionalAttr) - -// RealTout sets the optional Tout attribute to value. -// If not specified, defaults to DT_FLOAT -func RealTout(value tf.DataType) RealAttr { - return func(m optionalAttr) { - m["Tout"] = value - } -} - -// Returns the real part of a complex number. -// -// Given a tensor `input` of complex numbers, this operation returns a tensor of -// type `float` that is the real part of each element in `input`. All elements in -// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real -// part returned by this operation and *b* is the imaginary part. -// -// For example: -// -// ``` -// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j] -// tf.real(input) ==> [-2.25, 3.25] -// ``` -func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Real", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad. -type FractionalMaxPoolGradAttr func(optionalAttr) - -// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value. -// -// value: When set to True, it means when pooling, the values at the boundary -// of adjacent pooling cells are used by both cells. For example: -// -// `index 0 1 2 3 4` -// -// `value 20 5 16 3 7` -// -// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice. -// The result would be [20, 16] for fractional max pooling. 
-// If not specified, defaults to false -func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr { - return func(m optionalAttr) { - m["overlapping"] = value - } -} - -// Computes gradient of the FractionalMaxPool function. -// -// Arguments: -// orig_input: Original input for `fractional_max_pool` -// orig_output: Original output for `fractional_max_pool` -// out_backprop: 4-D with shape `[batch, height, width, channels]`. Gradients -// w.r.t. the output of `fractional_max_pool`. -// row_pooling_sequence: row pooling sequence, form pooling region with -// col_pooling_sequence. -// col_pooling_sequence: column pooling sequence, form pooling region with -// row_pooling sequence. -// -// Returns 4-D. Gradients w.r.t. the input of `fractional_max_pool`. -func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "FractionalMaxPoolGrad", - Input: []tf.Input{ - orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the maximum along segments of a tensor. -// -// Read -// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) -// for an explanation of segments. -// -// Computes a tensor such that -// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such -// that `segment_ids[j] == i`. -// -// If the max is empty for a given segment ID `i`, `output[i] = 0`. -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt> -// </div> -// -// For example: -// -// ``` -// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]]) -// tf.segment_max(c, tf.constant([0, 0, 1])) -// # ==> [[4, 3, 3, 4], -// # [5, 6, 7, 8]] -// ``` -// -// -// Arguments: -// -// segment_ids: A 1-D tensor whose size is equal to the size of `data`'s -// first dimension. Values should be sorted and can be repeated. -// -// Returns Has same shape as data, except for dimension 0 which -// has size `k`, the number of segments. -func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SegmentMax", - Input: []tf.Input{ - data, segment_ids, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2. -type ResourceApplyFtrlV2Attr func(optionalAttr) - -// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var and accum tensors will be protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update '*var' according to the Ftrl-proximal scheme. 
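// Not part of the generated file: the SegmentMax wrapper above documents a
// Python example; this is a hedged Go equivalent using the same data,
// assuming the standard TensorFlow Go binding import paths.
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	data := op.Const(s, [][]int32{{1, 2, 3, 4}, {4, 3, 2, 1}, {5, 6, 7, 8}})
	segmentIDs := op.Const(s, []int32{0, 0, 1}) // rows 0 and 1 form segment 0; row 2 forms segment 1
	maxPerSegment := op.SegmentMax(s, data, segmentIDs)
	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()
	out, err := sess.Run(nil, []tf.Output{maxPerSegment}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value()) // [[4 3 3 4] [5 6 7 8]]
}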
-// -// grad_with_shrinkage = grad + 2 * l2_shrinkage * var -// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage -// linear += grad_with_shrinkage + -// (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -// accum = accum_new -// -// Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// linear: Should be from a Variable(). -// grad: The gradient. -// lr: Scaling factor. Must be a scalar. -// l1: L1 regulariation. Must be a scalar. -// l2: L2 shrinkage regulariation. Must be a scalar. -// -// lr_power: Scaling factor. Must be a scalar. -// -// Returns the created operation. -func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceApplyFtrlV2", - Input: []tf.Input{ - var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Creates a tree ensemble model and returns a handle to it. -// -// Arguments: -// tree_ensemble_handle: Handle to the tree ensemble resource to be created. -// stamp_token: Token to use as the initial value of the resource stamp. -// tree_ensemble_serialized: Serialized proto of the tree ensemble. -// -// Returns the created operation. -func BoostedTreesCreateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "BoostedTreesCreateEnsemble", - Input: []tf.Input{ - tree_ensemble_handle, stamp_token, tree_ensemble_serialized, - }, - } - return scope.AddOperation(opspec) -} - -// Bucketize each feature based on bucket boundaries. -// -// An op that returns a list of float tensors, where each tensor represents the -// bucketized values for a single feature. -// -// Arguments: -// float_values: float; List of Rank 1 Tensor each containing float values for a single feature. -// bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a single -// feature. -// -// Returns int; List of Rank 1 Tensors each containing the bucketized values for a single feature. -func BoostedTreesBucketize(scope *Scope, float_values []tf.Output, bucket_boundaries []tf.Output) (buckets []tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "BoostedTreesBucketize", - Input: []tf.Input{ - tf.OutputList(float_values), tf.OutputList(bucket_boundaries), - }, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if buckets, idx, err = makeOutputList(op, idx, "buckets"); err != nil { - scope.UpdateErr("BoostedTreesBucketize", err) - return - } - return buckets -} - -// Set a summary_writer_interface to record statistics using given stats_aggregator. -// -// Returns the created operation. 
-func StatsAggregatorSetSummaryWriter(scope *Scope, stats_aggregator tf.Output, summary tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "StatsAggregatorSetSummaryWriter", - Input: []tf.Input{ - stats_aggregator, summary, - }, - } - return scope.AddOperation(opspec) -} - -// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum. -type ResourceApplyMomentumAttr func(optionalAttr) - -// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var and accum tensors will be protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value. -// -// value: If `True`, the tensor passed to compute grad will be -// var - lr * momentum * accum, so in the end, the var you get is actually -// var - lr * momentum * accum. -// If not specified, defaults to false -func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr { - return func(m optionalAttr) { - m["use_nesterov"] = value - } -} - -// Update '*var' according to the momentum scheme. Set use_nesterov = True if you -// -// want to use Nesterov momentum. -// -// accum = accum * momentum + grad -// var -= lr * accum -// -// Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// lr: Scaling factor. Must be a scalar. -// grad: The gradient. -// momentum: Momentum. Must be a scalar. -// -// Returns the created operation. -func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceApplyMomentum", - Input: []tf.Input{ - var_, accum, lr, grad, momentum, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Broadcast an array for a compatible shape. -// -// Broadcasting is the process of making arrays to have compatible shapes -// for arithmetic operations. Two shapes are compatible if for each -// dimension pair they are either equal or one of them is one. When trying -// to broadcast a Tensor to a shape, it starts with the trailing dimensions, -// and works its way forward. -// -// For example, -// -// ```python -// >>> x = tf.constant([1, 2, 3]) -// >>> y = tf.broadcast_to(x, [3, 3]) -// >>> sess.run(y) -// array([[1, 2, 3], -// [1, 2, 3], -// [1, 2, 3]], dtype=int32) -// ``` -// -// In the above example, the input Tensor with the shape of `[1, 3]` -// is broadcasted to output Tensor with shape of `[3, 3]`. -// -// Arguments: -// input: A Tensor to broadcast. -// shape: An 1-D `int` Tensor. The shape of the desired output. -// -// Returns A Tensor. -func BroadcastTo(scope *Scope, input tf.Output, shape tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "BroadcastTo", - Input: []tf.Input{ - input, shape, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2. 
-type RandomShuffleQueueV2Attr func(optionalAttr) - -// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value. -// -// value: The shape of each component in a value. The length of this attr must -// be either 0 or the same as the length of component_types. If the length of -// this attr is 0, the shapes of queue elements are not constrained, and -// only one element may be dequeued at a time. -// If not specified, defaults to <> -// -// REQUIRES: len(value) >= 0 -func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr { - return func(m optionalAttr) { - m["shapes"] = value - } -} - -// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value. -// -// value: The upper bound on the number of elements in this queue. -// Negative numbers mean no limit. -// If not specified, defaults to -1 -func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value. -// -// value: Dequeue will block unless there would be this -// many elements after the dequeue or the queue is closed. This -// ensures a minimum level of mixing of elements. -// If not specified, defaults to 0 -func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr { - return func(m optionalAttr) { - m["min_after_dequeue"] = value - } -} - -// RandomShuffleQueueV2Seed sets the optional seed attribute to value. -// -// value: If either seed or seed2 is set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, a random seed is used. -// If not specified, defaults to 0 -func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value. -// -// value: A second seed to avoid seed collision. -// If not specified, defaults to 0 -func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// RandomShuffleQueueV2Container sets the optional container attribute to value. -// -// value: If non-empty, this queue is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this queue will be shared under the given name -// across multiple sessions. -// If not specified, defaults to "" -func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// A queue that randomizes the order of elements. -// -// Arguments: -// component_types: The type of each component in a value. -// -// Returns The handle to the queue. -func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"component_types": component_types} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RandomShuffleQueueV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the truth value of (x > y) element-wise. 
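// Not part of the generated file: the *Attr helpers above (shapes, capacity,
// min_after_dequeue, seeds, container, shared_name) all follow the same
// functional-options pattern used throughout this package; each helper
// returns a closure that fills one entry of the attribute map. A hedged
// sketch of passing two such options to RandomShuffleQueueV2, assuming the
// standard TensorFlow Go binding import paths; the graph is only built, not run.
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// A queue holding float32 elements, with an explicit capacity and a
	// minimum number of elements that must remain after each dequeue.
	handle := op.RandomShuffleQueueV2(s, []tf.DataType{tf.Float},
		op.RandomShuffleQueueV2Capacity(1000),
		op.RandomShuffleQueueV2MinAfterDequeue(100))
	_ = handle
	if _, err := s.Finalize(); err != nil {
		panic(err)
	}
	fmt.Println("queue op added to the graph")
}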
-// -// *NOTE*: `Greater` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Greater", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse. -type SparseReduceMaxSparseAttr func(optionalAttr) - -// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value. -// -// value: If true, retain reduced dimensions with length 1. -// If not specified, defaults to false -func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr { - return func(m optionalAttr) { - m["keep_dims"] = value - } -} - -// Computes the max of elements across dimensions of a SparseTensor. -// -// This Op takes a SparseTensor and is the sparse counterpart to -// `tf.reduce_max()`. In contrast to SparseReduceMax, this Op returns a -// SparseTensor. -// -// Reduces `sp_input` along the dimensions given in `reduction_axes`. Unless -// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained -// with length 1. -// -// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor -// with a single element is returned. Additionally, the axes can be negative, -// which are interpreted according to the indexing rules in Python. -// -// Arguments: -// input_indices: 2-D. `N x R` matrix with the indices of non-empty values in a -// SparseTensor, possibly not in canonical ordering. -// input_values: 1-D. `N` non-empty values corresponding to `input_indices`. -// input_shape: 1-D. Shape of the input SparseTensor. -// reduction_axes: 1-D. Length-`K` vector containing the reduction axes. -func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SparseReduceMaxSparse", - Input: []tf.Input{ - input_indices, input_values, input_shape, reduction_axes, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Computes the determinant of one or more square matrices. -// -// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions -// form square matrices. The output is a tensor containing the determinants -// for all input submatrices `[..., :, :]`. -// -// Arguments: -// input: Shape is `[..., M, M]`. -// -// Returns Shape is `[...]`. -func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "MatrixDeterminant", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Produces the max pool of the input tensor for quantized types. -// -// Arguments: -// input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over. -// min_input: The float value that the lowest quantized input value represents. -// max_input: The float value that the highest quantized input value represents. 
-// ksize: The size of the window for each dimension of the input tensor. -// The length must be 4 to match the number of dimensions of the input. -// strides: The stride of the sliding window for each dimension of the input -// tensor. The length must be 4 to match the number of dimensions of the input. -// padding: The type of padding algorithm to use. -// -// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. -func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} - opspec := tf.OpSpec{ - Type: "QuantizedMaxPool", - Input: []tf.Input{ - input, min_input, max_input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Returns the cardinality of `input_dataset`. -// -// Returns the cardinality of `input_dataset`. -// -// Arguments: -// input_dataset: A variant tensor representing the dataset to return cardinality for. -// -// Returns The cardinality of `input_dataset`. Named constants are used to represent -// infinite and unknown cardinality. -func ExperimentalDatasetCardinality(scope *Scope, input_dataset tf.Output) (cardinality tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ExperimentalDatasetCardinality", - Input: []tf.Input{ - input_dataset, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes inverse hyperbolic tangent of x element-wise. -func Atanh(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Atanh", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the minimum along segments of a tensor. -// -// Read -// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) -// for an explanation of segments. -// -// Computes a tensor such that -// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such -// that `segment_ids[j] == i`. -// -// If the min is empty for a given segment ID `i`, `output[i] = 0`. -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt> -// </div> -// -// For example: -// -// ``` -// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]]) -// tf.segment_min(c, tf.constant([0, 0, 1])) -// # ==> [[1, 2, 2, 1], -// # [5, 6, 7, 8]] // ``` // // Arguments: // -// segment_ids: A 1-D tensor whose size is equal to the size of `data`'s -// first dimension. Values should be sorted and can be repeated. +// shape: Defines the shape of the output tensor. +// input_min: The minimum value of the input. +// input_max: The maximum value of the input. // -// Returns Has same shape as data, except for dimension 0 which -// has size `k`, the number of segments. -func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) { +// Returns This value is copied from input_min.This value is copied from input_max. 
+func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "SegmentMin", + Type: "QuantizedReshape", Input: []tf.Input{ - data, segment_ids, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingMomentumParametersGradAccumDebug. -type RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr) - -// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve Momentum embedding parameters with debug support. -// -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. -// -// Returns Parameter parameters updated by the Momentum optimization algorithm.Parameter momenta updated by the Momentum optimization algorithm.Parameter gradient_accumulators updated by the Momentum optimization algorithm. -func RetrieveTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// ShapeAttr is an optional argument to Shape. -type ShapeAttr func(optionalAttr) - -// ShapeOutType sets the optional out_type attribute to value. -// If not specified, defaults to DT_INT32 -func ShapeOutType(value tf.DataType) ShapeAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// Returns the shape of a tensor. -// -// This operation returns a 1-D integer tensor representing the shape of `input`. 
-// -// For example: -// -// ``` -// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] -// shape(t) ==> [2, 2, 3] -// ``` -func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Shape", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// CTCGreedyDecoderAttr is an optional argument to CTCGreedyDecoder. -type CTCGreedyDecoderAttr func(optionalAttr) - -// CTCGreedyDecoderMergeRepeated sets the optional merge_repeated attribute to value. -// -// value: If True, merge repeated classes in output. -// If not specified, defaults to false -func CTCGreedyDecoderMergeRepeated(value bool) CTCGreedyDecoderAttr { - return func(m optionalAttr) { - m["merge_repeated"] = value - } -} - -// Performs greedy decoding on the logits given in inputs. -// -// A note about the attribute merge_repeated: if enabled, when -// consecutive logits' maximum indices are the same, only the first of -// these is emitted. Labeling the blank '*', the sequence "A B B * B B" -// becomes "A B B" if merge_repeated = True and "A B B B B" if -// merge_repeated = False. -// -// Regardless of the value of merge_repeated, if the maximum index of a given -// time and batch corresponds to the blank, index `(num_classes - 1)`, no new -// element is emitted. -// -// Arguments: -// inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits. -// sequence_length: A vector containing sequence lengths, size `(batch_size)`. -// -// Returns Indices matrix, size `(total_decoded_outputs x 2)`, -// of a `SparseTensor<int64, 2>`. The rows store: [batch, time].Values vector, size: `(total_decoded_outputs)`, -// of a `SparseTensor<int64, 2>`. The vector stores the decoded classes.Shape vector, size `(2)`, of the decoded SparseTensor. -// Values are: `[batch_size, max_decoded_length]`.Matrix, size `(batch_size x 1)`, containing sequence -// log-probabilities. -func CTCGreedyDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, optional ...CTCGreedyDecoderAttr) (decoded_indices tf.Output, decoded_values tf.Output, decoded_shape tf.Output, log_probability tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "CTCGreedyDecoder", - Input: []tf.Input{ - inputs, sequence_length, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3) -} - -// Records the bytes size of each element of `input_dataset` in a StatsAggregator. -func ExperimentalBytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "ExperimentalBytesProducedStatsDataset", - Input: []tf.Input{ - input_dataset, tag, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Adjust the saturation of one or more images. -// -// `images` is a tensor of at least 3 dimensions. The last dimension is -// interpretted as channels, and must be three. -// -// The input image is considered in the RGB colorspace. 
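// Not part of the generated file: a hedged sketch of the Shape wrapper above,
// mirroring its documented example and using the ShapeOutType option to
// request an int64 result instead of the default int32. Import paths assume
// the standard TensorFlow Go bindings.
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	t := op.Const(s, [][][]int32{{{1, 1, 1}, {2, 2, 2}}, {{3, 3, 3}, {4, 4, 4}}})
	shape := op.Shape(s, t, op.ShapeOutType(tf.Int64))
	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()
	out, err := sess.Run(nil, []tf.Output{shape}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(out[0].Value()) // [2 2 3]
}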
Conceptually, the RGB -// colors are first mapped into HSV. A scale is then applied all the saturation -// values, and then remapped back to RGB colorspace. -// -// Arguments: -// images: Images to adjust. At least 3-D. -// scale: A float scale to add to the saturation. -// -// Returns The hue-adjusted image or images. -func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "AdjustSaturation", - Input: []tf.Input{ - images, scale, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// DataFormatDimMapAttr is an optional argument to DataFormatDimMap. -type DataFormatDimMapAttr func(optionalAttr) - -// DataFormatDimMapSrcFormat sets the optional src_format attribute to value. -// -// value: source data format. -// If not specified, defaults to "NHWC" -func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr { - return func(m optionalAttr) { - m["src_format"] = value - } -} - -// DataFormatDimMapDstFormat sets the optional dst_format attribute to value. -// -// value: destination data format. -// If not specified, defaults to "NCHW" -func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr { - return func(m optionalAttr) { - m["dst_format"] = value - } -} - -// Returns the dimension index in the destination data format given the one in -// -// the source data format. -// -// Arguments: -// x: A Tensor with each element as a dimension index in source data format. -// Must be in the range [-4, 4). -// -// Returns A Tensor with each element as a dimension index in destination data format. -func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "DataFormatDimMap", - Input: []tf.Input{ - x, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MaxPoolGradWithArgmaxAttr is an optional argument to MaxPoolGradWithArgmax. -type MaxPoolGradWithArgmaxAttr func(optionalAttr) - -// MaxPoolGradWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value. -// -// value: Whether to include batch dimension in flattened index of `argmax`. -// If not specified, defaults to false -func MaxPoolGradWithArgmaxIncludeBatchInIndex(value bool) MaxPoolGradWithArgmaxAttr { - return func(m optionalAttr) { - m["include_batch_in_index"] = value - } -} - -// Computes gradients of the maxpooling function. -// -// Arguments: -// input: The original input. -// grad: 4-D with shape `[batch, height, width, channels]`. Gradients w.r.t. the -// output of `max_pool`. -// argmax: The indices of the maximum values chosen for each output of `max_pool`. -// ksize: The size of the window for each dimension of the input tensor. -// strides: The stride of the sliding window for each dimension of the -// input tensor. -// padding: The type of padding algorithm to use. -// -// Returns Gradients w.r.t. the input of `max_pool`. 
-func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradWithArgmaxAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MaxPoolGradWithArgmax", - Input: []tf.Input{ - input, grad, argmax, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// InfeedEnqueueTupleAttr is an optional argument to InfeedEnqueueTuple. -type InfeedEnqueueTupleAttr func(optionalAttr) - -// InfeedEnqueueTupleLayouts sets the optional layouts attribute to value. -// -// value: A vector holding the requested layout in minor-to-major sequence for -// all the tuple shapes, in the order the shapes appear in the "shapes" input. -// The layout elements for a sub-shape can be set to -1, in which case the -// corresponding layout will be computed by the infeed operation. -// If not specified, defaults to <> -func InfeedEnqueueTupleLayouts(value []int64) InfeedEnqueueTupleAttr { - return func(m optionalAttr) { - m["layouts"] = value - } -} - -// InfeedEnqueueTupleDeviceOrdinal sets the optional device_ordinal attribute to value. -// -// value: The TPU device to use. This should be -1 when the Op -// is running on a TPU device, and >= 0 when the Op is running on the CPU -// device. -// If not specified, defaults to -1 -func InfeedEnqueueTupleDeviceOrdinal(value int64) InfeedEnqueueTupleAttr { - return func(m optionalAttr) { - m["device_ordinal"] = value - } -} - -// Feeds multiple Tensor values into the computation as an XLA tuple. -// -// Arguments: -// inputs: A list of tensors that will be provided using the infeed mechanism. -// shapes: The shapes of each tensor in `inputs`. -// -// Returns the created operation. -func InfeedEnqueueTuple(scope *Scope, inputs []tf.Output, shapes []tf.Shape, optional ...InfeedEnqueueTupleAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"shapes": shapes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "InfeedEnqueueTuple", - Input: []tf.Input{ - tf.OutputList(inputs), - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform. -type StatelessRandomUniformAttr func(optionalAttr) - -// StatelessRandomUniformDtype sets the optional dtype attribute to value. -// -// value: The type of the output. -// If not specified, defaults to DT_FLOAT -func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr { - return func(m optionalAttr) { - m["dtype"] = value - } -} - -// Outputs deterministic pseudorandom random values from a uniform distribution. -// -// The generated values follow a uniform distribution in the range `[0, 1)`. The -// lower bound 0 is included in the range, while the upper bound 1 is excluded. -// -// The outputs are a deterministic function of `shape` and `seed`. -// -// Arguments: -// shape: The shape of the output tensor. -// seed: 2 seeds (shape [2]). -// -// Returns Random values with specified shape. 
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StatelessRandomUniform", - Input: []tf.Input{ - shape, seed, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Enqueue multiple Tensor values on the computation outfeed. -// -// Arguments: -// inputs: A list of tensors that will be inserted into the outfeed queue as an -// XLA tuple. -// -// Returns the created operation. -func OutfeedEnqueueTuple(scope *Scope, inputs []tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "OutfeedEnqueueTuple", - Input: []tf.Input{ - tf.OutputList(inputs), - }, - } - return scope.AddOperation(opspec) -} - -// Computes softplus gradients for a softplus operation. -// -// Arguments: -// gradients: The backpropagated gradients to the corresponding softplus operation. -// features: The features passed as input to the corresponding softplus operation. -// -// Returns The gradients: `gradients / (1 + exp(-features))`. -func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SoftplusGrad", - Input: []tf.Input{ - gradients, features, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// QuantizedDepthwiseConv2DWithBiasAndReluAttr is an optional argument to QuantizedDepthwiseConv2DWithBiasAndRelu. -type QuantizedDepthwiseConv2DWithBiasAndReluAttr func(optionalAttr) - -// QuantizedDepthwiseConv2DWithBiasAndReluOutType sets the optional out_type attribute to value. -// -// value: The type of the output. -// If not specified, defaults to DT_QINT32 -func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) QuantizedDepthwiseConv2DWithBiasAndReluAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. -// -// value: List of dilation values. -// If not specified, defaults to <i:1 i:1 i:1 i:1 > -func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes quantized depthwise Conv2D with Bias and Relu. -// -// Arguments: -// input: The original input tensor. -// filter: The original filter tensor. -// bias: The original bias tensor. -// min_input: The float value that the minimum quantized input value represents. -// max_input: The float value that the maximum quantized input value represents. -// min_filter: The float value that the minimum quantized filter value represents. -// max_filter: The float value that the maximum quantized filter value represents. -// strides: List of stride values. -// -// -// Returns The output tensor.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents. 
-func QuantizedDepthwiseConv2DWithBiasAndRelu(scope *Scope, input tf.Output, filter tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DWithBiasAndReluAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QuantizedDepthwiseConv2DWithBiasAndRelu", - Input: []tf.Input{ - input, filter, bias, min_input, max_input, min_filter, max_filter, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// CompilationResultProto indicating the status of the TPU compilation. -func TPUCompilationResult(scope *Scope) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TPUCompilationResult", - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// AssertAttr is an optional argument to Assert. -type AssertAttr func(optionalAttr) - -// AssertSummarize sets the optional summarize attribute to value. -// -// value: Print this many entries of each tensor. -// If not specified, defaults to 3 -func AssertSummarize(value int64) AssertAttr { - return func(m optionalAttr) { - m["summarize"] = value - } -} - -// Asserts that the given condition is true. -// -// If `condition` evaluates to false, print the list of tensors in `data`. -// `summarize` determines how many entries of the tensors to print. -// -// Arguments: -// condition: The condition to evaluate. -// data: The tensors to print out when condition is false. -// -// Returns the created operation. -func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Assert", - Input: []tf.Input{ - condition, tf.OutputList(data), - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Adds two `SparseTensor` objects to produce another `SparseTensor`. -// -// The input `SparseTensor` objects' indices are assumed ordered in standard -// lexicographic order. If this is not the case, before this step run -// `SparseReorder` to restore index ordering. -// -// By default, if two values sum to zero at some index, the output `SparseTensor` -// would still include that particular location in its index, storing a zero in the -// corresponding value slot. To override this, callers can specify `thresh`, -// indicating that if the sum has a magnitude strictly smaller than `thresh`, its -// corresponding value and index would then not be included. In particular, -// `thresh == 0` (default) means everything is kept and actual thresholding happens -// only for a positive value. -// -// In the following shapes, `nnz` is the count after taking `thresh` into account. -// -// Arguments: -// a_indices: 2-D. The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix. -// a_values: 1-D. The `values` of the first `SparseTensor`, size `[nnz]` Vector. -// a_shape: 1-D. The `shape` of the first `SparseTensor`, size `[ndims]` Vector. -// b_indices: 2-D. The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix. -// b_values: 1-D. The `values` of the second `SparseTensor`, size `[nnz]` Vector. -// b_shape: 1-D. 
The `shape` of the second `SparseTensor`, size `[ndims]` Vector. -// thresh: 0-D. The magnitude threshold that determines if an output value/index -// pair takes space. -func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseAdd", - Input: []tf.Input{ - a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh, + tensor, shape, input_min, input_max, }, } op := scope.AddOperation(opspec) return op.Output(0), op.Output(1), op.Output(2) } -// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient. -type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr) - -// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value. -// If not specified, defaults to -6 -func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr { - return func(m optionalAttr) { - m["min"] = value - } -} - -// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value. -// If not specified, defaults to 6 -func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr { - return func(m optionalAttr) { - m["max"] = value - } -} - -// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value. -// If not specified, defaults to 8 -func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr { - return func(m optionalAttr) { - m["num_bits"] = value - } -} - -// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value. -// If not specified, defaults to false -func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr { - return func(m optionalAttr) { - m["narrow_range"] = value - } -} - -// Compute gradients for a FakeQuantWithMinMaxArgs operation. +// Outputs a `Summary` protocol buffer with a histogram. +// +// The generated +// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) +// has one summary value containing a histogram for `values`. +// +// This op reports an `InvalidArgument` error if any value is not finite. // // Arguments: -// gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation. -// inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation. +// tag: Scalar. Tag to use for the `Summary.Value`. +// values: Any shape. Values to use to build the histogram. // -// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation: -// `gradients * (inputs >= min && inputs <= max)`. -func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) { +// Returns Scalar. Serialized `Summary` protocol buffer. +func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } opspec := tf.OpSpec{ - Type: "FakeQuantWithMinMaxArgsGradient", + Type: "HistogramSummary", Input: []tf.Input{ - gradients, inputs, + tag, values, }, - Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) } -// An op that receives embedding activations on the TPU. 
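// Not part of the generated file: a hedged sketch of the HistogramSummary
// wrapper added in the hunk above. It builds a histogram summary for a small
// float tensor; the fetched scalar string is a serialized Summary proto.
// Import paths assume the standard TensorFlow Go bindings.
package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	tag := op.Const(s, "activations") // scalar tag for the Summary.Value
	values := op.Const(s, []float32{0.1, 0.4, 0.4, 0.9})
	summary := op.HistogramSummary(s, tag, values)
	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	defer sess.Close()
	out, err := sess.Run(nil, []tf.Output{summary}, nil)
	if err != nil {
		panic(err)
	}
	fmt.Printf("serialized Summary proto: %d bytes\n", len(out[0].Value().(string)))
}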
-// -// The TPU system performs the embedding lookups and aggregations specified by -// the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The -// results of these aggregations are visible to the Tensorflow Graph as the -// outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing -// one Tensor of activations per table specified in the model. There can be at -// most one RecvTPUEmbeddingActivations op in the TPU graph. -// -// Arguments: -// num_outputs: The number of output activation tensors, equal to the number of -// embedding tables in the model. -// config: Serialized TPUEmbeddingConfiguration proto. -// -// Returns A TensorList of embedding activations containing one Tensor per -// embedding table in the model. -func RecvTPUEmbeddingActivations(scope *Scope, num_outputs int64, config string) (outputs []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_outputs": num_outputs, "config": config} - opspec := tf.OpSpec{ - Type: "RecvTPUEmbeddingActivations", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil { - scope.UpdateErr("RecvTPUEmbeddingActivations", err) - return - } - return outputs -} - -// TruncatedNormalAttr is an optional argument to TruncatedNormal. -type TruncatedNormalAttr func(optionalAttr) - -// TruncatedNormalSeed sets the optional seed attribute to value. -// -// value: If either `seed` or `seed2` are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func TruncatedNormalSeed(value int64) TruncatedNormalAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// TruncatedNormalSeed2 sets the optional seed2 attribute to value. -// -// value: A second seed to avoid seed collision. -// If not specified, defaults to 0 -func TruncatedNormalSeed2(value int64) TruncatedNormalAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Outputs random values from a truncated normal distribution. -// -// The generated values follow a normal distribution with mean 0 and standard -// deviation 1, except that values whose magnitude is more than 2 standard -// deviations from the mean are dropped and re-picked. -// -// Arguments: -// shape: The shape of the output tensor. -// dtype: The type of the output. -// -// Returns A tensor of the specified shape filled with random truncated normal -// values. -func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TruncatedNormal", - Input: []tf.Input{ - shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate. -type ResourceScatterNdUpdateAttr func(optionalAttr) - -// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value. -// -// value: An optional bool. Defaults to True. If True, the assignment will -// be protected by a lock; otherwise the behavior is undefined, -// but may exhibit less contention. 
-// If not specified, defaults to true -func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Applies sparse `updates` to individual values or slices within a given -// -// variable according to `indices`. -// -// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. -// -// `indices` must be integer tensor, containing indices into `ref`. -// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`. -// -// The innermost dimension of `indices` (with length `K`) corresponds to -// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th -// dimension of `ref`. -// -// `updates` is `Tensor` of rank `Q-1+P-K` with shape: -// -// ``` -// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]. -// ``` -// -// For example, say we want to update 4 scattered elements to a rank-1 tensor to -// 8 elements. In Python, that update would look like this: -// -// ```python -// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8]) -// indices = tf.constant([[4], [3], [1] ,[7]]) -// updates = tf.constant([9, 10, 11, 12]) -// update = tf.scatter_nd_update(ref, indices, updates) -// with tf.Session() as sess: -// print sess.run(update) -// ``` -// -// The resulting update to ref would look like this: -// -// [1, 11, 3, 10, 9, 6, 7, 12] -// -// See `tf.scatter_nd` for more details about how to make updates to -// slices. -// -// Arguments: -// ref: A resource handle. Must be from a VarHandleOp. -// indices: A Tensor. Must be one of the following types: int32, int64. -// A tensor of indices into ref. -// updates: A Tensor. Must have the same type as ref. A tensor of updated -// values to add to ref. -// -// Returns the created operation. -func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceScatterNdUpdate", - Input: []tf.Input{ - ref, indices, updates, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// LeakyReluAttr is an optional argument to LeakyRelu. -type LeakyReluAttr func(optionalAttr) - -// LeakyReluAlpha sets the optional alpha attribute to value. -// If not specified, defaults to 0.2 -func LeakyReluAlpha(value float32) LeakyReluAttr { - return func(m optionalAttr) { - m["alpha"] = value - } -} - -// Computes rectified linear: `max(features, features * alpha)`. -func LeakyRelu(scope *Scope, features tf.Output, optional ...LeakyReluAttr) (activations tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LeakyRelu", - Input: []tf.Input{ - features, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ShapeNAttr is an optional argument to ShapeN. -type ShapeNAttr func(optionalAttr) - -// ShapeNOutType sets the optional out_type attribute to value. -// If not specified, defaults to DT_INT32 -func ShapeNOutType(value tf.DataType) ShapeNAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// Returns shape of tensors. -// -// This operation returns N 1-D integer tensors representing shape of `input[i]s`. 
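// Illustrative usage sketch for ShapeN (added for exposition; not part of the
// generated wrappers). It assumes the standard tensorflow/go client packages:
//
//   import (
//       "fmt"
//
//       tf "github.com/tensorflow/tensorflow/tensorflow/go"
//       "github.com/tensorflow/tensorflow/tensorflow/go/op"
//   )
//
// The sketch fetches the shapes of two constants through a single ShapeN op.
func exampleShapeN() error {
	s := op.NewScope()
	a := op.Const(s.SubScope("a"), [][]float32{{1, 2, 3}, {4, 5, 6}}) // shape [2, 3]
	b := op.Const(s.SubScope("b"), []float32{1, 2, 3, 4})             // shape [4]
	shapes := op.ShapeN(s, []tf.Output{a, b}, op.ShapeNOutType(tf.Int64))
	graph, err := s.Finalize()
	if err != nil {
		return err
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		return err
	}
	defer sess.Close()
	out, err := sess.Run(nil, shapes, nil)
	if err != nil {
		return err
	}
	fmt.Println(out[0].Value(), out[1].Value()) // [2 3] [4]
	return nil
}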
-func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ShapeN", - Input: []tf.Input{ - tf.OutputList(input), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if output, idx, err = makeOutputList(op, idx, "output"); err != nil { - scope.UpdateErr("ShapeN", err) - return - } - return output -} - // OutfeedDequeueAttr is an optional argument to OutfeedDequeue. type OutfeedDequeueAttr func(optionalAttr) @@ -24181,2254 +38208,59 @@ func OutfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape, optional .. return op.Output(0) } -// Check if the input matches the regex pattern. +// Restore a reader to a previously saved state. // -// The input is a string tensor of any shape. The pattern is the -// regular expression to be matched with every element of the input tensor. -// The boolean values (True or False) of the output tensor indicate -// if the input matches the regex pattern provided. -// -// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax) +// Not all Readers support being restored, so this can produce an +// Unimplemented error. // // Arguments: -// input: A string tensor of the text to be processed. -// pattern: The regular expression to match the input. -// -// Returns A bool tensor with the same shape as `input`. -func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"pattern": pattern} - opspec := tf.OpSpec{ - Type: "StaticRegexFullMatch", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise. -// -// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs) -// ](http://arxiv.org/abs/1511.07289) -func Elu(scope *Scope, features tf.Output) (activations tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Elu", - Input: []tf.Input{ - features, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Slice a `SparseTensor` based on the `start` and `size`. -// -// For example, if the input is -// -// input_tensor = shape = [2, 7] -// [ a d e ] -// [b c ] -// -// Graphically the output tensors are: -// -// sparse_slice([0, 0], [2, 4]) = shape = [2, 4] -// [ a ] -// [b c ] -// -// sparse_slice([0, 4], [2, 3]) = shape = [2, 3] -// [ d e ] -// [ ] -// -// Arguments: -// indices: 2-D tensor represents the indices of the sparse tensor. -// values: 1-D tensor represents the values of the sparse tensor. -// shape: 1-D. tensor represents the shape of the sparse tensor. -// start: 1-D. tensor represents the start of the slice. -// size: 1-D. tensor represents the size of the slice. -// output indices: A list of 1-D tensors represents the indices of the output -// sparse tensors. -// -// Returns A list of 1-D tensors represents the values of the output sparse -// tensors.A list of 1-D tensors represents the shape of the output sparse -// tensors. 
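// Illustrative usage sketch for SparseSlice (added for exposition; not part of
// the generated wrappers). It assumes the tf/op imports shown in the ShapeN
// sketch above and encodes the 2x7 example from the comment, slicing out the
// first four columns (which keeps the values "a", "b" and "c").
func exampleSparseSlice() (outputIndices, outputValues, outputShape tf.Output) {
	s := op.NewScope()
	indices := op.Const(s.SubScope("indices"), [][]int64{{0, 2}, {0, 4}, {0, 5}, {1, 0}, {1, 1}})
	values := op.Const(s.SubScope("values"), []string{"a", "d", "e", "b", "c"})
	shape := op.Const(s.SubScope("shape"), []int64{2, 7})
	start := op.Const(s.SubScope("start"), []int64{0, 0})
	size := op.Const(s.SubScope("size"), []int64{2, 4})
	return op.SparseSlice(s, indices, values, shape, start, size)
}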
-func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseSlice", - Input: []tf.Input{ - indices, values, shape, start, size, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Computes sigmoid of `x` element-wise. -// -// Specifically, `y = 1 / (1 + exp(-x))`. -func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Sigmoid", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes gradients for the scaled exponential linear (Selu) operation. -// -// Arguments: -// gradients: The backpropagated gradients to the corresponding Selu operation. -// outputs: The outputs of the corresponding Selu operation. -// -// Returns The gradients: `gradients * (outputs + scale * alpha)` -// if outputs < 0, `scale * gradients` otherwise. -func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SeluGrad", - Input: []tf.Input{ - gradients, outputs, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Draw bounding boxes on a batch of images. -// -// Outputs a copy of `images` but draws on top of the pixels zero or more bounding -// boxes specified by the locations in `boxes`. The coordinates of the each -// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The -// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and -// height of the underlying image. -// -// For example, if an image is 100 x 200 pixels (height x width) and the bounding -// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of -// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates). -// -// Parts of the bounding box may fall outside the image. -// -// Arguments: -// images: 4-D with shape `[batch, height, width, depth]`. A batch of images. -// boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding -// boxes. -// -// Returns 4-D with the same shape as `images`. The batch of input images with -// bounding boxes drawn on the images. -func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "DrawBoundingBoxes", - Input: []tf.Input{ - images, boxes, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// SparseReduceSumAttr is an optional argument to SparseReduceSum. -type SparseReduceSumAttr func(optionalAttr) - -// SparseReduceSumKeepDims sets the optional keep_dims attribute to value. -// -// value: If true, retain reduced dimensions with length 1. -// If not specified, defaults to false -func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr { - return func(m optionalAttr) { - m["keep_dims"] = value - } -} - -// Computes the sum of elements across dimensions of a SparseTensor. -// -// This Op takes a SparseTensor and is the sparse counterpart to -// `tf.reduce_sum()`. In particular, this Op also returns a dense `Tensor` -// instead of a sparse one. -// -// Reduces `sp_input` along the dimensions given in `reduction_axes`. 
Unless -// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained -// with length 1. -// -// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor -// with a single element is returned. Additionally, the axes can be negative, -// which are interpreted according to the indexing rules in Python. -// -// Arguments: -// input_indices: 2-D. `N x R` matrix with the indices of non-empty values in a -// SparseTensor, possibly not in canonical ordering. -// input_values: 1-D. `N` non-empty values corresponding to `input_indices`. -// input_shape: 1-D. Shape of the input SparseTensor. -// reduction_axes: 1-D. Length-`K` vector containing the reduction axes. -// -// Returns `R-K`-D. The reduced Tensor. -func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SparseReduceSum", - Input: []tf.Input{ - input_indices, input_values, input_shape, reduction_axes, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RetrieveTPUEmbeddingMomentumParametersAttr is an optional argument to RetrieveTPUEmbeddingMomentumParameters. -type RetrieveTPUEmbeddingMomentumParametersAttr func(optionalAttr) - -// RetrieveTPUEmbeddingMomentumParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingMomentumParametersTableId(value int64) RetrieveTPUEmbeddingMomentumParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingMomentumParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingMomentumParametersTableName(value string) RetrieveTPUEmbeddingMomentumParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve Momentum embedding parameters. -// -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. -// -// Returns Parameter parameters updated by the Momentum optimization algorithm.Parameter momenta updated by the Momentum optimization algorithm. -func RetrieveTPUEmbeddingMomentumParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersAttr) (parameters tf.Output, momenta tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingMomentumParameters", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal. -type ParameterizedTruncatedNormalAttr func(optionalAttr) - -// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value. 
-// -// value: If either `seed` or `seed2` are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value. -// -// value: A second seed to avoid seed collision. -// If not specified, defaults to 0 -func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Outputs random values from a normal distribution. The parameters may each be a -// -// scalar which applies to the entire output, or a vector of length shape[0] which -// stores the parameters for each batch. -// -// Arguments: -// shape: The shape of the output tensor. Batches are indexed by the 0th dimension. -// means: The mean parameter of each batch. -// stdevs: The standard deviation parameter of each batch. Must be greater than 0. -// minvals: The minimum cutoff. May be -infinity. -// maxvals: The maximum cutoff. May be +infinity, and must be more than the minval -// for each batch. -// -// Returns A matrix of shape num_batches x samples_per_batch, filled with random -// truncated normal values using the parameters for each row. -func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ParameterizedTruncatedNormal", - Input: []tf.Input{ - shape, means, stdevs, minvals, maxvals, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// LoadTPUEmbeddingAdagradParametersAttr is an optional argument to LoadTPUEmbeddingAdagradParameters. -type LoadTPUEmbeddingAdagradParametersAttr func(optionalAttr) - -// LoadTPUEmbeddingAdagradParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func LoadTPUEmbeddingAdagradParametersTableId(value int64) LoadTPUEmbeddingAdagradParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// LoadTPUEmbeddingAdagradParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func LoadTPUEmbeddingAdagradParametersTableName(value string) LoadTPUEmbeddingAdagradParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Load Adagrad embedding parameters. -// -// An op that loads optimization parameters into HBM for embedding. Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the Adagrad optimization algorithm. -// accumulators: Value of accumulators used in the Adagrad optimization algorithm. -// -// +// reader_handle: Handle to a Reader. +// state: Result of a ReaderSerializeState of a Reader with type +// matching reader_handle. // // Returns the created operation. 
-func LoadTPUEmbeddingAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdagradParametersAttr) (o *tf.Operation) { +func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingAdagradParameters", + Type: "ReaderRestoreStateV2", Input: []tf.Input{ - parameters, accumulators, + reader_handle, state, }, - Attrs: attrs, } return scope.AddOperation(opspec) } -// Creates a dataset that contains the elements of `input_dataset` ignoring errors. -func ExperimentalIgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "ExperimentalIgnoreErrorsDataset", - Input: []tf.Input{ - input_dataset, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} +// SetSizeAttr is an optional argument to SetSize. +type SetSizeAttr func(optionalAttr) -// CudnnRNNBackpropV2Attr is an optional argument to CudnnRNNBackpropV2. -type CudnnRNNBackpropV2Attr func(optionalAttr) - -// CudnnRNNBackpropV2RnnMode sets the optional rnn_mode attribute to value. -// If not specified, defaults to "lstm" -func CudnnRNNBackpropV2RnnMode(value string) CudnnRNNBackpropV2Attr { - return func(m optionalAttr) { - m["rnn_mode"] = value - } -} - -// CudnnRNNBackpropV2InputMode sets the optional input_mode attribute to value. -// If not specified, defaults to "linear_input" -func CudnnRNNBackpropV2InputMode(value string) CudnnRNNBackpropV2Attr { - return func(m optionalAttr) { - m["input_mode"] = value - } -} - -// CudnnRNNBackpropV2Direction sets the optional direction attribute to value. -// If not specified, defaults to "unidirectional" -func CudnnRNNBackpropV2Direction(value string) CudnnRNNBackpropV2Attr { - return func(m optionalAttr) { - m["direction"] = value - } -} - -// CudnnRNNBackpropV2Dropout sets the optional dropout attribute to value. -// If not specified, defaults to 0 -func CudnnRNNBackpropV2Dropout(value float32) CudnnRNNBackpropV2Attr { - return func(m optionalAttr) { - m["dropout"] = value - } -} - -// CudnnRNNBackpropV2Seed sets the optional seed attribute to value. -// If not specified, defaults to 0 -func CudnnRNNBackpropV2Seed(value int64) CudnnRNNBackpropV2Attr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// CudnnRNNBackpropV2Seed2 sets the optional seed2 attribute to value. -// If not specified, defaults to 0 -func CudnnRNNBackpropV2Seed2(value int64) CudnnRNNBackpropV2Attr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Backprop step of CudnnRNN. -// -// Compute the backprop of both data and weights in a RNN. Takes an extra -// "host_reserved" inupt than CudnnRNNBackprop, which is used to determine RNN -// cudnnRNNAlgo_t and cudnnMathType_t. -// -// rnn_mode: Indicates the type of the RNN model. -// input_mode: Indicates whether there is a linear projection between the input and -// the actual computation before the first layer. 
'skip_input' is only allowed -// when input_size == num_units; 'auto_select' implies 'skip_input' when -// input_size == num_units; otherwise, it implies 'linear_input'. -// direction: Indicates whether a bidirectional model will be used. Should be -// "unidirectional" or "bidirectional". -// dropout: Dropout probability. When set to 0., dropout is disabled. -// seed: The 1st part of a seed to initialize dropout. -// seed2: The 2nd part of a seed to initialize dropout. -// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size]. -// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size, -// num_units]. -// input_c: For LSTM, a 3-D tensor with the shape of -// [num_layer * dir, batch, num_units]. For other models, it is ignored. -// params: A 1-D tensor that contains the weights and biases in an opaque layout. -// The size must be created through CudnnRNNParamsSize, and initialized -// separately. Note that they might not be compatible across different -// generations. So it is a good idea to save and restore -// output: A 3-D tensor with the shape of [seq_length, batch_size, -// dir * num_units]. -// output_h: The same shape has input_h. -// output_c: The same shape as input_c for LSTM. An empty tensor for other models. -// output_backprop: A 3-D tensor with the same shape as output in the forward pass. -// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward -// pass. -// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward -// pass. -// reserve_space: The same reserve_space produced in the forward operation. -// host_reserved: The same host_reserved produced in the forward operation. -// input_backprop: The backprop to input in the forward pass. Has the same shape -// as input. -// input_h_backprop: The backprop to input_h in the forward pass. Has the same -// shape as input_h. -// input_c_backprop: The backprop to input_c in the forward pass. Has the same -// shape as input_c. -// params_backprop: The backprop to the params buffer in the forward pass. Has the -// same shape as params. -func CudnnRNNBackpropV2(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, host_reserved tf.Output, optional ...CudnnRNNBackpropV2Attr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "CudnnRNNBackpropV2", - Input: []tf.Input{ - input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, host_reserved, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3) -} - -// Inverse 2D fast Fourier transform. -// -// Computes the inverse 2-dimensional discrete Fourier transform over the -// inner-most 2 dimensions of `input`. -// -// Arguments: -// input: A complex tensor. -// -// Returns A complex tensor of the same shape as `input`. The inner-most 2 -// dimensions of `input` are replaced with their inverse 2D Fourier transform. 
-// -// @compatibility(numpy) -// Equivalent to np.fft.ifft2 -// @end_compatibility -func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "IFFT2D", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug. -type RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr) - -// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve RMSProp embedding parameters with debug support. -// -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. -// -// Returns Parameter parameters updated by the RMSProp optimization algorithm.Parameter ms updated by the RMSProp optimization algorithm.Parameter mom updated by the RMSProp optimization algorithm.Parameter gradient_accumulators updated by the RMSProp optimization algorithm. -func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3) -} - -// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax. -type MaxPoolWithArgmaxAttr func(optionalAttr) - -// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value. -// If not specified, defaults to DT_INT64 -func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr { - return func(m optionalAttr) { - m["Targmax"] = value - } -} - -// MaxPoolWithArgmaxIncludeBatchInIndex sets the optional include_batch_in_index attribute to value. -// -// value: Whether to include batch dimension in flattened index of `argmax`. -// If not specified, defaults to false -func MaxPoolWithArgmaxIncludeBatchInIndex(value bool) MaxPoolWithArgmaxAttr { - return func(m optionalAttr) { - m["include_batch_in_index"] = value - } -} - -// Performs max pooling on the input and outputs both max values and indices. 
-// -// The indices in `argmax` are flattened, so that a maximum value at position -// `[b, y, x, c]` becomes flattened index: -// `(y * width + x) * channels + c` if `include_batch_in_index` is False; -// `((b * height + y) * width + x) * channels + c` if `include_batch_in_index` is True. -// -// The indices returned are always in `[0, height) x [0, width)` before flattening, -// even if padding is involved and the mathematically correct answer is outside -// (either negative or too large). This is a bug, but fixing it is difficult to do -// in a safe backwards compatible way, especially due to flattening. -// -// Arguments: -// input: 4-D with shape `[batch, height, width, channels]`. Input to pool over. -// ksize: The size of the window for each dimension of the input tensor. -// strides: The stride of the sliding window for each dimension of the -// input tensor. -// padding: The type of padding algorithm to use. -// -// Returns The max pooled output tensor.4-D. The flattened indices of the max values chosen for each output. -func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MaxPoolWithArgmax", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// RetrieveTPUEmbeddingMDLAdagradLightParametersAttr is an optional argument to RetrieveTPUEmbeddingMDLAdagradLightParameters. -type RetrieveTPUEmbeddingMDLAdagradLightParametersAttr func(optionalAttr) - -// RetrieveTPUEmbeddingMDLAdagradLightParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingMDLAdagradLightParametersTableId(value int64) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingMDLAdagradLightParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingMDLAdagradLightParametersTableName(value string) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve MDL Adagrad Light embedding parameters. -// -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. -// -// Returns Parameter parameters updated by the MDL Adagrad Light optimization algorithm.Parameter accumulators updated by the MDL Adagrad Light optimization algorithm.Parameter weights updated by the MDL Adagrad Light optimization algorithm.Parameter benefits updated by the MDL Adagrad Light optimization algorithm. 
-func RetrieveTPUEmbeddingMDLAdagradLightParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMDLAdagradLightParametersAttr) (parameters tf.Output, accumulators tf.Output, weights tf.Output, benefits tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingMDLAdagradLightParameters", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3) -} - -// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax. -type ResourceApplyAdaMaxAttr func(optionalAttr) - -// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var, m, and v tensors will be protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update '*var' according to the AdaMax algorithm. -// -// m_t <- beta1 * m_{t-1} + (1 - beta1) * g -// v_t <- max(beta2 * v_{t-1}, abs(g)) -// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon) -// -// Arguments: -// var_: Should be from a Variable(). -// m: Should be from a Variable(). -// v: Should be from a Variable(). -// beta1_power: Must be a scalar. -// lr: Scaling factor. Must be a scalar. -// beta1: Momentum factor. Must be a scalar. -// beta2: Momentum factor. Must be a scalar. -// epsilon: Ridge term. Must be a scalar. -// grad: The gradient. -// -// Returns the created operation. -func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceApplyAdaMax", - Input: []tf.Input{ - var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// AnyAttr is an optional argument to Any. -type AnyAttr func(optionalAttr) - -// AnyKeepDims sets the optional keep_dims attribute to value. -// -// value: If true, retain reduced dimensions with length 1. -// If not specified, defaults to false -func AnyKeepDims(value bool) AnyAttr { - return func(m optionalAttr) { - m["keep_dims"] = value - } -} - -// Computes the "logical or" of elements across dimensions of a tensor. -// -// Reduces `input` along the dimensions given in `axis`. Unless -// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -// `axis`. If `keep_dims` is true, the reduced dimensions are -// retained with length 1. -// -// Arguments: -// input: The tensor to reduce. -// axis: The dimensions to reduce. Must be in the range -// `[-rank(input), rank(input))`. -// -// Returns The reduced tensor. 
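// Illustrative usage sketch for Any (added for exposition; not part of the
// generated wrappers, and assuming the tf/op imports shown earlier): reduce a
// [2, 3] boolean matrix along axis 1 while keeping the reduced dimension.
func exampleAny() tf.Output {
	s := op.NewScope()
	input := op.Const(s.SubScope("input"), [][]bool{{true, false, false}, {false, false, false}})
	axis := op.Const(s.SubScope("axis"), int32(1))
	// The result has shape [2, 1] and value [[true], [false]].
	return op.Any(s, input, axis, op.AnyKeepDims(true))
}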
-func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Any", - Input: []tf.Input{ - input, axis, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Transforms a tf.Example proto (as a string) into typed tensors. -// -// Arguments: -// serialized: A vector containing a batch of binary serialized Example protos. -// dense_defaults: A list of Tensors (some may be empty), whose length matches -// the length of `dense_keys`. dense_defaults[j] provides default values -// when the example's feature_map lacks dense_key[j]. If an empty Tensor is -// provided for dense_defaults[j], then the Feature dense_keys[j] is required. -// The input type is inferred from dense_defaults[j], even when it's empty. -// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined, -// then the shape of dense_defaults[j] must match that of dense_shapes[j]. -// If dense_shapes[j] has an undefined major dimension (variable strides dense -// feature), dense_defaults[j] must contain a single element: -// the padding element. -// num_sparse: The number of sparse features to be parsed from the example. This -// must match the lengths of `sparse_keys` and `sparse_types`. -// sparse_keys: A list of `num_sparse` strings. -// The keys expected in the Examples' features associated with sparse values. -// dense_keys: The keys expected in the Examples' features associated with dense -// values. -// sparse_types: A list of `num_sparse` types; the data types of data in each -// Feature given in sparse_keys. -// Currently the ParseSingleExample op supports DT_FLOAT (FloatList), -// DT_INT64 (Int64List), and DT_STRING (BytesList). -// dense_shapes: The shapes of data in each Feature given in dense_keys. -// The length of this list must match the length of `dense_keys`. The -// number of elements in the Feature corresponding to dense_key[j] must -// always equal dense_shapes[j].NumEntries(). If dense_shapes[j] == -// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j] -// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1, -// ..., DN), the shape of the output Tensor dense_values[j] will be (M, -// D1, .., DN), where M is the number of blocks of elements of length -// D1 * .... * DN, in the input. 
-func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes} - opspec := tf.OpSpec{ - Type: "ParseSingleExample", - Input: []tf.Input{ - serialized, tf.OutputList(dense_defaults), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil { - scope.UpdateErr("ParseSingleExample", err) - return - } - if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil { - scope.UpdateErr("ParseSingleExample", err) - return - } - if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil { - scope.UpdateErr("ParseSingleExample", err) - return - } - if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil { - scope.UpdateErr("ParseSingleExample", err) - return - } - return sparse_indices, sparse_values, sparse_shapes, dense_values -} - -// Returns a batched matrix tensor with new batched diagonal values. -// -// Given `input` and `diagonal`, this operation returns a tensor with the -// same shape and values as `input`, except for the main diagonal of the -// innermost matrices. These will be overwritten by the values in `diagonal`. -// -// The output is computed as follows: -// -// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has -// `k` dimensions `[I, J, K, ..., min(M, N)]`. Then the output is a -// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where: -// -// * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`. -// * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`. -// -// Arguments: -// input: Rank `k+1`, where `k >= 1`. -// diagonal: Rank `k`, where `k >= 1`. -// -// Returns Rank `k+1`, with `output.shape = input.shape`. -func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "MatrixSetDiag", - Input: []tf.Input{ - input, diagonal, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl. -type ResourceSparseApplyFtrlAttr func(optionalAttr) - -// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var and accum tensors will be protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update relevant entries in '*var' according to the Ftrl-proximal scheme. 
-// -// That is for rows we have grad for, we update var, accum and linear as follows: -// accum_new = accum + grad * grad -// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -// accum = accum_new -// -// Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// linear: Should be from a Variable(). -// grad: The gradient. -// indices: A vector of indices into the first dimension of var and accum. -// lr: Scaling factor. Must be a scalar. -// l1: L1 regularization. Must be a scalar. -// l2: L2 regularization. Must be a scalar. -// lr_power: Scaling factor. Must be a scalar. -// -// Returns the created operation. -func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceSparseApplyFtrl", - Input: []tf.Input{ - var_, accum, linear, grad, indices, lr, l1, l2, lr_power, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Computes hyperbolic sine of x element-wise. -func Sinh(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Sinh", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// OrderedMapClearAttr is an optional argument to OrderedMapClear. -type OrderedMapClearAttr func(optionalAttr) - -// OrderedMapClearCapacity sets the optional capacity attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func OrderedMapClearCapacity(value int64) OrderedMapClearAttr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr { - return func(m optionalAttr) { - m["memory_limit"] = value - } -} - -// OrderedMapClearContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func OrderedMapClearContainer(value string) OrderedMapClearAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// OrderedMapClearSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func OrderedMapClearSharedName(value string) OrderedMapClearAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Op removes all elements in the underlying container. -// -// Returns the created operation. -func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtypes": dtypes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "OrderedMapClear", - - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Creates a TensorList by indexing into a Tensor. -// -// Each member of the TensorList corresponds to one row of the input tensor, -// specified by the given index (see `tf.gather`). -// -// tensor: The input tensor. 
-// indices: The indices used to index into the list. -// element_shape: The shape of the elements in the list (can be less specified than -// the shape of the tensor). -// num_elements: The size of the output list. Must be large enough to accommodate -// the largest index in indices. If -1, the list is just large enough to include -// the largest index in indices. -// output_handle: The TensorList. -func TensorListScatterV2(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output, num_elements tf.Output) (output_handle tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorListScatterV2", - Input: []tf.Input{ - tensor, indices, element_shape, num_elements, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a dataset that executes a SQL query and emits rows of the result set. -// -// Arguments: -// driver_name: The database type. Currently, the only supported type is 'sqlite'. -// data_source_name: A connection string to connect to the database. -// query: A SQL query to execute. -// -// -func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "ExperimentalSqlDataset", - Input: []tf.Input{ - driver_name, data_source_name, query, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RetrieveTPUEmbeddingAdadeltaParametersAttr is an optional argument to RetrieveTPUEmbeddingAdadeltaParameters. -type RetrieveTPUEmbeddingAdadeltaParametersAttr func(optionalAttr) - -// RetrieveTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingAdadeltaParametersTableId(value int64) RetrieveTPUEmbeddingAdadeltaParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingAdadeltaParametersTableName(value string) RetrieveTPUEmbeddingAdadeltaParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve Adadelta embedding parameters. -// -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. -// -// Returns Parameter parameters updated by the Adadelta optimization algorithm.Parameter accumulators updated by the Adadelta optimization algorithm.Parameter updates updated by the Adadelta optimization algorithm. 
-func RetrieveTPUEmbeddingAdadeltaParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdadeltaParametersAttr) (parameters tf.Output, accumulators tf.Output, updates tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingAdadeltaParameters", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Outputs deterministic pseudorandom random integers from a uniform distribution. -// -// The generated values follow a uniform distribution in the range `[minval, maxval)`. -// -// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`. -// -// Arguments: -// shape: The shape of the output tensor. -// seed: 2 seeds (shape [2]). -// minval: Minimum value (inclusive, scalar). -// maxval: Maximum value (exclusive, scalar). -// -// Returns Random values with specified shape. -func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "StatelessRandomUniformInt", - Input: []tf.Input{ - shape, seed, minval, maxval, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RestoreAttr is an optional argument to Restore. -type RestoreAttr func(optionalAttr) - -// RestorePreferredShard sets the optional preferred_shard attribute to value. -// -// value: Index of file to open first if multiple files match -// `file_pattern`. -// If not specified, defaults to -1 -func RestorePreferredShard(value int64) RestoreAttr { - return func(m optionalAttr) { - m["preferred_shard"] = value - } -} - -// Restores a tensor from checkpoint files. -// -// Reads a tensor stored in one or several files. If there are several files (for -// instance because a tensor was saved as slices), `file_pattern` may contain -// wildcard symbols (`*` and `?`) in the filename portion only, not in the -// directory portion. -// -// If a `file_pattern` matches several files, `preferred_shard` can be used to hint -// in which file the requested tensor is likely to be found. This op will first -// open the file at index `preferred_shard` in the list of matching files and try -// to restore tensors from that file. Only if some tensors or tensor slices are -// not found in that first file, then the Op opens all the files. Setting -// `preferred_shard` to match the value passed as the `shard` input -// of a matching `Save` Op may speed up Restore. This attribute only affects -// performance, not correctness. The default value -1 means files are processed in -// order. -// -// See also `RestoreSlice`. -// -// Arguments: -// file_pattern: Must have a single element. The pattern of the files from -// which we read the tensor. -// tensor_name: Must have a single element. The name of the tensor to be -// restored. -// dt: The type of the tensor to be restored. -// -// Returns The restored tensor. 
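// Illustrative usage sketch for Restore (added for exposition; not part of the
// generated wrappers, and assuming the tf/op imports shown earlier). The
// checkpoint pattern and tensor name below are hypothetical placeholders.
func exampleRestore() tf.Output {
	s := op.NewScope()
	filePattern := op.Const(s.SubScope("file_pattern"), "/tmp/model.ckpt-*") // hypothetical V1 checkpoint files
	tensorName := op.Const(s.SubScope("tensor_name"), "weights")             // hypothetical tensor name
	// Ask the op to open the first matching file before falling back to the full list.
	return op.Restore(s, filePattern, tensorName, tf.Float, op.RestorePreferredShard(0))
}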
-func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dt": dt} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Restore", - Input: []tf.Input{ - file_pattern, tensor_name, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// OrderedMapSizeAttr is an optional argument to OrderedMapSize. -type OrderedMapSizeAttr func(optionalAttr) - -// OrderedMapSizeCapacity sets the optional capacity attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func OrderedMapSizeCapacity(value int64) OrderedMapSizeAttr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// OrderedMapSizeMemoryLimit sets the optional memory_limit attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func OrderedMapSizeMemoryLimit(value int64) OrderedMapSizeAttr { - return func(m optionalAttr) { - m["memory_limit"] = value - } -} - -// OrderedMapSizeContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func OrderedMapSizeContainer(value string) OrderedMapSizeAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// OrderedMapSizeSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func OrderedMapSizeSharedName(value string) OrderedMapSizeAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Op returns the number of elements in the underlying container. -func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSizeAttr) (size tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtypes": dtypes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "OrderedMapSize", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Performs gradient updates of embedding tables. -// -// Arguments: -// inputs: A TensorList of gradients with which to update embedding tables. -// This argument has the same length and shapes as the return value of -// RecvTPUEmbeddingActivations, but contains gradients of the model's loss -// with respect to the embedding activations. The embedding tables are updated -// from these gradients via the optimizer specified in the TPU embedding -// configuration given to tpu.initialize_system. -// learning_rates: A TensorList of float32 scalars, one for each dynamic learning -// rate tag: see the comments in -// //third_party/tensorflow/core/protobuf/tpu/optimization_parameters.proto. -// Multiple tables can share the same dynamic learning rate tag as specified -// in the configuration. If the learning rates for all tables are constant, -// this list should be empty. -// config: Serialized TPUEmbeddingConfiguration proto. -// -// Returns the created operation. -func SendTPUEmbeddingGradients(scope *Scope, inputs []tf.Output, learning_rates []tf.Output, config string) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"config": config} - opspec := tf.OpSpec{ - Type: "SendTPUEmbeddingGradients", - Input: []tf.Input{ - tf.OutputList(inputs), tf.OutputList(learning_rates), - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// An Op to exchange data across TPU replicas. 
-// -// On each replica, the input is split into `split_count` blocks along -// `split_dimension` and send to the other replicas given group_assignment. After -// receiving `split_count` - 1 blocks from other replicas, we concatenate the -// blocks along `concat_dimension` as the output. -// -// For example, suppose there are 2 TPU replicas: -// replica 0 receives input: `[[A, B]]` -// replica 1 receives input: `[[C, D]]` -// -// group_assignment=`[[0, 1]]` -// concat_dimension=0 -// split_dimension=1 -// split_count=2 -// -// replica 0's output: `[[A], [C]]` -// replica 1's output: `[[B], [D]]` -// -// Arguments: -// input: The local input to the sum. -// group_assignment: An int32 tensor with shape -// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the -// replica ids in the ith subgroup. -// concat_dimension: The dimension number to concatenate. -// split_dimension: The dimension number to split. -// split_count: The number of splits, this number must equal to the sub-group -// size(group_assignment.get_shape()[1]) -// -// Returns The exchanged result. -func AllToAll(scope *Scope, input tf.Output, group_assignment tf.Output, concat_dimension int64, split_dimension int64, split_count int64) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"concat_dimension": concat_dimension, "split_dimension": split_dimension, "split_count": split_count} - opspec := tf.OpSpec{ - Type: "AllToAll", - Input: []tf.Input{ - input, group_assignment, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// L2 Loss. -// -// Computes half the L2 norm of a tensor without the `sqrt`: -// -// output = sum(t ** 2) / 2 -// -// Arguments: -// t: Typically 2-D, but may have any dimensions. -// -// Returns 0-D. -func L2Loss(scope *Scope, t tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "L2Loss", - Input: []tf.Input{ - t, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StringFormatAttr is an optional argument to StringFormat. -type StringFormatAttr func(optionalAttr) - -// StringFormatTemplate sets the optional template attribute to value. -// -// value: A string, the template to format tensor summaries into. -// If not specified, defaults to "%s" -func StringFormatTemplate(value string) StringFormatAttr { - return func(m optionalAttr) { - m["template"] = value - } -} - -// StringFormatPlaceholder sets the optional placeholder attribute to value. -// -// value: A string, at each placeholder in the template a subsequent tensor summary will be inserted. -// If not specified, defaults to "%s" -func StringFormatPlaceholder(value string) StringFormatAttr { - return func(m optionalAttr) { - m["placeholder"] = value - } -} - -// StringFormatSummarize sets the optional summarize attribute to value. -// -// value: When formatting the tensor summaries print the first and last summarize entries of each tensor dimension. -// If not specified, defaults to 3 -func StringFormatSummarize(value int64) StringFormatAttr { - return func(m optionalAttr) { - m["summarize"] = value - } -} - -// Formats a string template using a list of tensors. -// -// Formats a string template using a list of tensors, pretty-printing tensor summaries. -// -// Arguments: -// inputs: The list of tensors to format into the placeholder string. -// -// Returns = The resulting string scalar. 
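// Illustrative usage sketch for StringFormat (added for exposition; not part of
// the generated wrappers, and assuming the tf/op imports shown earlier): format
// one tensor summary into a template string.
func exampleStringFormat() tf.Output {
	s := op.NewScope()
	t := op.Const(s, []int32{1, 2, 3, 4, 5, 6, 7})
	// Produces a scalar string along the lines of "tensor: [1 2 3 ... 5 6 7]".
	return op.StringFormat(s, []tf.Output{t},
		op.StringFormatTemplate("tensor: %s"),
		op.StringFormatSummarize(3))
}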
-func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StringFormat", - Input: []tf.Input{ - tf.OutputList(inputs), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// An Op to permute tensors across replicated TPU instances. -// -// Each instance supplies its own input. -// -// For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing -// source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs: -// `[D, A, B, C]`. -// -// Arguments: -// input: The local input to be permuted. Currently only supports float and -// bfloat16. -// source_target_pairs: A tensor with shape [num_pairs, 2]. -// -// Returns The permuted input. -func CollectivePermute(scope *Scope, input tf.Output, source_target_pairs tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "CollectivePermute", - Input: []tf.Input{ - input, source_target_pairs, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Deserialize and concatenate `SparseTensors` from a serialized minibatch. -// -// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where -// `N` is the minibatch size and the rows correspond to packed outputs of -// `SerializeSparse`. The ranks of the original `SparseTensor` objects -// must all match. When the final `SparseTensor` is created, it has rank one -// higher than the ranks of the incoming `SparseTensor` objects -// (they have been concatenated along a new row dimension). -// -// The output `SparseTensor` object's shape values for all dimensions but the -// first are the max across the input `SparseTensor` objects' shape values -// for the corresponding dimensions. Its first shape value is `N`, the minibatch -// size. -// -// The input `SparseTensor` objects' indices are assumed ordered in -// standard lexicographic order. If this is not the case, after this -// step run `SparseReorder` to restore index ordering. -// -// For example, if the serialized input is a `[2 x 3]` matrix representing two -// original `SparseTensor` objects: -// -// index = [ 0] -// [10] -// [20] -// values = [1, 2, 3] -// shape = [50] -// -// and -// -// index = [ 2] -// [10] -// values = [4, 5] -// shape = [30] -// -// then the final deserialized `SparseTensor` will be: -// -// index = [0 0] -// [0 10] -// [0 20] -// [1 2] -// [1 10] -// values = [1, 2, 3, 4, 5] -// shape = [2 50] -// -// Arguments: -// serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects. -// Must have 3 columns. -// dtype: The `dtype` of the serialized `SparseTensor` objects. -func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - opspec := tf.OpSpec{ - Type: "DeserializeManySparse", - Input: []tf.Input{ - serialized_sparse, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Sends `input` to all devices that are connected to the output. -// -// Sends `input` to all devices that are connected to the output. 
-// -// The graph should be constructed so that all ops connected to the output have a -// valid device assignment, and the op itself is assigned one of these devices. -// -// input: The input to the broadcast. -// output: The same as input. -// shape: The shape of the input tensor. -// -func NcclBroadcast(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"shape": shape} - opspec := tf.OpSpec{ - Type: "NcclBroadcast", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes gradients for the exponential linear (Elu) operation. -// -// Arguments: -// gradients: The backpropagated gradients to the corresponding Elu operation. -// outputs: The outputs of the corresponding Elu operation. -// -// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0, -// `gradients` otherwise. -func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "EluGrad", - Input: []tf.Input{ - gradients, outputs, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// EnqueueTPUEmbeddingSparseTensorBatchAttr is an optional argument to EnqueueTPUEmbeddingSparseTensorBatch. -type EnqueueTPUEmbeddingSparseTensorBatchAttr func(optionalAttr) - -// EnqueueTPUEmbeddingSparseTensorBatchDeviceOrdinal sets the optional device_ordinal attribute to value. -// -// value: The TPU device to use. Should be >= 0 and less than the number -// of TPU cores in the task on which the node is placed. -// If not specified, defaults to -1 -func EnqueueTPUEmbeddingSparseTensorBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingSparseTensorBatchAttr { - return func(m optionalAttr) { - m["device_ordinal"] = value - } -} - -// EnqueueTPUEmbeddingSparseTensorBatchCombiners sets the optional combiners attribute to value. -// -// value: A list of string scalars, one for each embedding table that specify -// how to normalize the embedding activations after weighted summation. -// Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have -// the sum of the weights be 0 for 'mean' or the sum of the squared weights be -// 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for -// all tables. -// If not specified, defaults to <> -func EnqueueTPUEmbeddingSparseTensorBatchCombiners(value []string) EnqueueTPUEmbeddingSparseTensorBatchAttr { - return func(m optionalAttr) { - m["combiners"] = value - } -} - -// EnqueueTPUEmbeddingSparseTensorBatchMaxSequenceLengths sets the optional max_sequence_lengths attribute to value. -// If not specified, defaults to <> -func EnqueueTPUEmbeddingSparseTensorBatchMaxSequenceLengths(value []int64) EnqueueTPUEmbeddingSparseTensorBatchAttr { - return func(m optionalAttr) { - m["max_sequence_lengths"] = value - } -} - -// Eases the porting of code that uses tf.nn.embedding_lookup_sparse(). -// -// sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond -// to the ith feature. table_ids[i] indicates which embedding table to look up ith -// feature. -// -// The tensors at corresponding positions in the three input lists (sample_indices, -// embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1 -// with dim_size() equal to the total number of lookups into the table described by -// the corresponding feature. 
-// -// Arguments: -// sample_indices: A list of rank 1 Tensors specifying the training example to -// which the corresponding embedding_indices and aggregation_weights values -// belong. It corresponds to sp_ids.indices[:,0] in embedding_lookup_sparse(). -// embedding_indices: A list of rank 1 Tensors, indices into the embedding tables. -// It corresponds to sp_ids.values in embedding_lookup_sparse(). -// aggregation_weights: A list of rank 1 Tensors containing per training example -// aggregation weights. It corresponds to sp_weights.values in -// embedding_lookup_sparse(). -// mode_override: A string input that overrides the mode specified in the -// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference', -// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set -// in TPUEmbeddingConfiguration is used, otherwise mode_override is used. -// table_ids: A list of integers specifying the identifier of the embedding table -// (offset of TableDescriptor in the TPUEmbeddingConfiguration) to lookup the -// corresponding input. The ith input is looked up using table_ids[i]. The size -// of the table_ids list must be equal to that of sample_indices, -// embedding_indices and aggregation_weights. -// -// Returns the created operation. -func EnqueueTPUEmbeddingSparseTensorBatch(scope *Scope, sample_indices []tf.Output, embedding_indices []tf.Output, aggregation_weights []tf.Output, mode_override tf.Output, table_ids []int64, optional ...EnqueueTPUEmbeddingSparseTensorBatchAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"table_ids": table_ids} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "EnqueueTPUEmbeddingSparseTensorBatch", - Input: []tf.Input{ - tf.OutputList(sample_indices), tf.OutputList(embedding_indices), tf.OutputList(aggregation_weights), mode_override, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Compute the pairwise cross product. -// -// `a` and `b` must be the same shape; they can either be simple 3-element vectors, -// or any shape where the innermost dimension is 3. In the latter case, each pair -// of corresponding 3-element vectors is cross-multiplied independently. -// -// Arguments: -// a: A tensor containing 3-element vectors. -// b: Another tensor, of same type and shape as `a`. -// -// Returns Pairwise cross product of the vectors in `a` and `b`. -func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Cross", - Input: []tf.Input{ - a, b, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2. -type Conv3DBackpropInputV2Attr func(optionalAttr) - -// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value. -// -// value: The data format of the input and output data. With the -// default format "NDHWC", the data is stored in the order of: -// [batch, in_depth, in_height, in_width, in_channels]. -// Alternatively, the format could be "NCDHW", the data storage order is: -// [batch, in_channels, in_depth, in_height, in_width]. -// If not specified, defaults to "NDHWC" -func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value. 
-// -// value: 1-D tensor of length 5. The dilation factor for each dimension of -// `input`. If set to k > 1, there will be k-1 skipped cells between each -// filter element on that dimension. The dimension order is determined by the -// value of `data_format`, see above for details. Dilations in the batch and -// depth dimensions must be 1. -// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 > -func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes the gradients of 3-D convolution with respect to the input. -// -// Arguments: -// input_sizes: An integer vector representing the tensor shape of `input`, -// where `input` is a 5-D -// `[batch, depth, rows, cols, in_channels]` tensor. -// filter: Shape `[depth, rows, cols, in_channels, out_channels]`. -// `in_channels` must match between `input` and `filter`. -// out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols, -// out_channels]`. -// strides: 1-D tensor of length 5. The stride of the sliding window for each -// dimension of `input`. Must have `strides[0] = strides[4] = 1`. -// padding: The type of padding algorithm to use. -func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Conv3DBackpropInputV2", - Input: []tf.Input{ - input_sizes, filter, out_backprop, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the reciprocal of x element-wise. -// -// I.e., \\(y = 1 / x\\). -func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Reciprocal", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Scatter the data from the input value into specific TensorArray elements. -// -// `indices` must be a vector, its length must match the first dim of `value`. -// -// Arguments: -// handle: The handle to a TensorArray. -// indices: The locations at which to write the tensor elements. -// value: The concatenated tensor to write to the TensorArray. -// flow_in: A float scalar that enforces proper chaining of operations. -// -// Returns A float scalar that enforces proper chaining of operations. -func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorArrayScatterV3", - Input: []tf.Input{ - handle, indices, value, flow_in, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Converts each string in the input Tensor to its hash mod by a number of buckets. -// -// The hash function is deterministic on the content of the string within the -// process. -// -// Note that the hash function may change from time to time. -// This functionality will be deprecated and it's recommended to use -// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`. -// -// Arguments: -// -// num_buckets: The number of buckets. -// -// Returns A Tensor of the same shape as the input `string_tensor`. 
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_buckets": num_buckets} - opspec := tf.OpSpec{ - Type: "StringToHashBucket", - Input: []tf.Input{ - string_tensor, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// BiasAddGradAttr is an optional argument to BiasAddGrad. -type BiasAddGradAttr func(optionalAttr) - -// BiasAddGradDataFormat sets the optional data_format attribute to value. -// -// value: Specify the data format of the input and output data. With the -// default format "NHWC", the bias tensor will be added to the last dimension -// of the value tensor. -// Alternatively, the format could be "NCHW", the data storage order of: -// [batch, in_channels, in_height, in_width]. -// The tensor will be added to "in_channels", the third-to-the-last -// dimension. -// If not specified, defaults to "NHWC" -func BiasAddGradDataFormat(value string) BiasAddGradAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// The backward operation for "BiasAdd" on the "bias" tensor. -// -// It accumulates all the values from out_backprop into the feature dimension. -// For NHWC data format, the feature dimension is the last. For NCHW data format, -// the feature dimension is the third-to-last. -// -// Arguments: -// out_backprop: Any number of dimensions. -// -// Returns 1-D with size the feature dimension of `out_backprop`. -func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "BiasAddGrad", - Input: []tf.Input{ - out_backprop, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes inverse hyperbolic cosine of x element-wise. -func Acosh(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Acosh", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a dataset that uses a custom thread pool to compute `input_dataset`. -// -// Arguments: -// -// thread_pool: A resource produced by the ThreadPoolHandle op. -// -// -func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread_pool tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "ExperimentalThreadPoolDataset", - Input: []tf.Input{ - input_dataset, thread_pool, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// OutfeedDequeueTupleAttr is an optional argument to OutfeedDequeueTuple. -type OutfeedDequeueTupleAttr func(optionalAttr) - -// OutfeedDequeueTupleDeviceOrdinal sets the optional device_ordinal attribute to value. -// -// value: The TPU device to use. This should be -1 when the Op -// is running on a TPU device, and >= 0 when the Op is running on the CPU -// device. -// If not specified, defaults to -1 -func OutfeedDequeueTupleDeviceOrdinal(value int64) OutfeedDequeueTupleAttr { - return func(m optionalAttr) { - m["device_ordinal"] = value - } -} - -// Retrieve multiple values from the computation outfeed. 
-// -// This operation will block indefinitely until data is available. Output `i` -// corresponds to XLA tuple element `i`. -// -// Arguments: -// dtypes: The element types of each element in `outputs`. -// shapes: The shapes of each tensor in `outputs`. -// -// Returns A list of tensors that will be read from the outfeed. -func OutfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape, optional ...OutfeedDequeueTupleAttr) (outputs []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "OutfeedDequeueTuple", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil { - scope.UpdateErr("OutfeedDequeueTuple", err) - return - } - return outputs -} - -// Computes the sum along segments of a tensor. -// -// Read -// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) -// for an explanation of segments. -// -// Computes a tensor such that -// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such -// that `segment_ids[j...] == i`. Unlike `SegmentSum`, `segment_ids` -// need not be sorted and need not cover all values in the full -// range of valid values. -// -// If the sum is empty for a given segment ID `i`, `output[i] = 0`. -// If the given segment ID `i` is negative, the value is dropped and will not be -// added to the sum of the segment. -// -// `num_segments` should equal the number of distinct segment IDs. -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt> -// </div> -// -// ``` python -// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]]) -// tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2) -// # ==> [[ 5, 5, 5, 5], -// # [5, 6, 7, 8]] -// ``` -// -// -// Arguments: -// -// segment_ids: A tensor whose shape is a prefix of `data.shape`. -// -// -// Returns Has same shape as data, except for the first `segment_ids.rank` -// dimensions, which are replaced with a single dimension which has size -// `num_segments`. -func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "UnsortedSegmentSum", - Input: []tf.Input{ - data, segment_ids, num_segments, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MaxAttr is an optional argument to Max. -type MaxAttr func(optionalAttr) - -// MaxKeepDims sets the optional keep_dims attribute to value. -// -// value: If true, retain reduced dimensions with length 1. -// If not specified, defaults to false -func MaxKeepDims(value bool) MaxAttr { - return func(m optionalAttr) { - m["keep_dims"] = value - } -} - -// Computes the maximum of elements across dimensions of a tensor. -// -// Reduces `input` along the dimensions given in `axis`. Unless -// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -// `axis`. If `keep_dims` is true, the reduced dimensions are -// retained with length 1. -// -// Arguments: -// input: The tensor to reduce. -// axis: The dimensions to reduce. Must be in the range -// `[-rank(input), rank(input))`. -// -// Returns The reduced tensor. 
-func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Max", - Input: []tf.Input{ - input, axis, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Concatenates quantized tensors along one dimension. -// -// Arguments: -// concat_dim: 0-D. The dimension along which to concatenate. Must be in the -// range [0, rank(values)). -// values: The `N` Tensors to concatenate. Their ranks and types must match, -// and their sizes must match in all dimensions except `concat_dim`. -// input_mins: The minimum scalar values for each of the input tensors. -// input_maxes: The maximum scalar values for each of the input tensors. -// -// Returns A `Tensor` with the concatenation of values stacked along the -// `concat_dim` dimension. This tensor's shape matches that of `values` except -// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents. -func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "QuantizedConcat", - Input: []tf.Input{ - concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes), - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad. -type MaxPool3DGradAttr func(optionalAttr) - -// MaxPool3DGradDataFormat sets the optional data_format attribute to value. -// -// value: The data format of the input and output data. With the -// default format "NDHWC", the data is stored in the order of: -// [batch, in_depth, in_height, in_width, in_channels]. -// Alternatively, the format could be "NCDHW", the data storage order is: -// [batch, in_channels, in_depth, in_height, in_width]. -// If not specified, defaults to "NDHWC" -func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Computes gradients of max pooling function. -// -// Arguments: -// orig_input: The original input tensor. -// orig_output: The original output tensor. -// grad: Output backprop of shape `[batch, depth, rows, cols, channels]`. -// ksize: 1-D tensor of length 5. The size of the window for each dimension of -// the input tensor. Must have `ksize[0] = ksize[4] = 1`. -// strides: 1-D tensor of length 5. The stride of the sliding window for each -// dimension of `input`. Must have `strides[0] = strides[4] = 1`. -// padding: The type of padding algorithm to use. 
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MaxPool3DGrad", - Input: []tf.Input{ - orig_input, orig_output, grad, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// UnicodeEncodeAttr is an optional argument to UnicodeEncode. -type UnicodeEncodeAttr func(optionalAttr) - -// UnicodeEncodeErrors sets the optional errors attribute to value. -// -// value: Error handling policy when there is invalid formatting found in the input. -// The value of 'strict' will cause the operation to produce a InvalidArgument -// error on any invalid input formatting. A value of 'replace' (the default) will -// cause the operation to replace any invalid formatting in the input with the -// `replacement_char` codepoint. A value of 'ignore' will cause the operation to -// skip any invalid formatting in the input and produce no corresponding output -// character. -// If not specified, defaults to "replace" -func UnicodeEncodeErrors(value string) UnicodeEncodeAttr { - return func(m optionalAttr) { - m["errors"] = value - } -} - -// UnicodeEncodeReplacementChar sets the optional replacement_char attribute to value. -// -// value: The replacement character codepoint to be used in place of any invalid -// formatting in the input when `errors='replace'`. Any valid unicode codepoint may -// be used. The default value is the default unicode replacement character is -// 0xFFFD (U+65533). -// If not specified, defaults to 65533 -func UnicodeEncodeReplacementChar(value int64) UnicodeEncodeAttr { - return func(m optionalAttr) { - m["replacement_char"] = value - } -} - -// Encode a tensor of ints into unicode strings. -// -// Returns a vector of strings, where `output[i]` is constructed by encoding the -// Unicode codepoints in `input_values[input_splits[i]:input_splits[i+1]]` -// using `output_encoding`. -// -// --- -// -// Example: -// -// ``` -// input_values = [72, 101, 108, 108, 111, 87, 111, 114, 108, 100] -// input_splits = [0, 5, 10] -// output_encoding = 'UTF-8' -// -// output = ['Hello', 'World'] -// ``` -// -// Arguments: -// input_values: A 1D tensor containing the unicode codepoints that should be encoded. -// input_splits: A 1D tensor specifying how the unicode codepoints should be split into strings. -// In particular, `output[i]` is constructed by encoding the codepoints in the -// slice `input_values[input_splits[i]:input_splits[i+1]]`. -// output_encoding: Unicode encoding of the output strings. Valid encodings are: `"UTF-8", -// "UTF-16-BE", and "UTF-32-BE"`. -// -// Returns The 1-D Tensor of strings encoded from the provided unicode codepoints. -func UnicodeEncode(scope *Scope, input_values tf.Output, input_splits tf.Output, output_encoding string, optional ...UnicodeEncodeAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_encoding": output_encoding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "UnicodeEncode", - Input: []tf.Input{ - input_values, input_splits, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Converts the given `resource_handle` representing an iterator to a string. 
-// -// Arguments: -// resource_handle: A handle to an iterator resource. -// -// Returns A string representation of the given handle. -func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "IteratorToStringHandle", - Input: []tf.Input{ - resource_handle, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// TridiagonalSolveAttr is an optional argument to TridiagonalSolve. -type TridiagonalSolveAttr func(optionalAttr) - -// TridiagonalSolvePartialPivoting sets the optional partial_pivoting attribute to value. -// -// value: Whether to apply partial pivoting. Partial pivoting makes the procedure more -// stable, but slower. +// SetSizeValidateIndices sets the optional validate_indices attribute to value. // If not specified, defaults to true -func TridiagonalSolvePartialPivoting(value bool) TridiagonalSolveAttr { +func SetSizeValidateIndices(value bool) SetSizeAttr { return func(m optionalAttr) { - m["partial_pivoting"] = value + m["validate_indices"] = value } } -// Solves tridiagonal systems of equations. +// Number of unique elements along last dimension of input `set`. // -// Solves tridiagonal systems of equations. -// Supports batch dimensions and multiple right-hand sides per each left-hand -// side. -// On CPU, solution is computed via Gaussian elimination with or without partial -// pivoting, depending on `partial_pivoting` attribute. On GPU, Nvidia's cuSPARSE -// library is used: https://docs.nvidia.com/cuda/cusparse/index.html#gtsv +// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`, +// and `set_shape`. The last dimension contains values in a set, duplicates are +// allowed but ignored. +// +// If `validate_indices` is `True`, this op validates the order and range of `set` +// indices. // // Arguments: -// diagonals: Tensor of shape `[..., 3, M]` whose innermost 2 dimensions represent the -// tridiagonal matrices with three rows being the superdiagonal, diagonals, and -// subdiagonals, in order. The last element of the superdiagonal and the first -// element of the subdiagonal is ignored. -// rhs: Tensor of shape `[..., M, K]`, representing K right-hand sides per each -// left-hand side. +// set_indices: 2D `Tensor`, indices of a `SparseTensor`. +// set_values: 1D `Tensor`, values of a `SparseTensor`. +// set_shape: 1D `Tensor`, shape of a `SparseTensor`. // -// Returns Tensor of shape `[..., M, K]` containing the solutions -func TridiagonalSolve(scope *Scope, diagonals tf.Output, rhs tf.Output, optional ...TridiagonalSolveAttr) (output tf.Output) { +// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st +// `n-1` dimensions as `set`. Each value is the number of unique elements in +// the corresponding `[0...n-1]` dimension of `set`. +func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) { if scope.Err() != nil { return } @@ -26437,9 +38269,9 @@ func TridiagonalSolve(scope *Scope, diagonals tf.Output, rhs tf.Output, optional a(attrs) } opspec := tf.OpSpec{ - Type: "TridiagonalSolve", + Type: "SetSize", Input: []tf.Input{ - diagonals, rhs, + set_indices, set_values, set_shape, }, Attrs: attrs, } @@ -26447,1951 +38279,25 @@ func TridiagonalSolve(scope *Scope, diagonals tf.Output, rhs tf.Output, optional return op.Output(0) } -// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd. 
-type ResourceScatterNdAddAttr func(optionalAttr) - -// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value. -// -// value: An optional bool. Defaults to True. If True, the assignment will -// be protected by a lock; otherwise the behavior is undefined, -// but may exhibit less contention. -// If not specified, defaults to true -func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Applies sparse addition to individual values or slices in a Variable. -// -// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. -// -// `indices` must be integer tensor, containing indices into `ref`. -// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`. -// -// The innermost dimension of `indices` (with length `K`) corresponds to -// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th -// dimension of `ref`. -// -// `updates` is `Tensor` of rank `Q-1+P-K` with shape: -// -// ``` -// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]] -// ``` -// -// For example, say we want to add 4 scattered elements to a rank-1 tensor to -// 8 elements. In Python, that addition would look like this: -// -// ```python -// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True) -// indices = tf.constant([[4], [3], [1], [7]]) -// updates = tf.constant([9, 10, 11, 12]) -// add = tf.scatter_nd_add(ref, indices, updates) -// with tf.Session() as sess: -// print sess.run(add) -// ``` -// -// The resulting update to ref would look like this: -// -// [1, 13, 3, 14, 14, 6, 7, 20] -// -// See `tf.scatter_nd` for more details about how to make updates to -// slices. +// Restore a Reader to its initial clean state. // // Arguments: -// ref: A resource handle. Must be from a VarHandleOp. -// indices: A Tensor. Must be one of the following types: int32, int64. -// A tensor of indices into ref. -// updates: A Tensor. Must have the same type as ref. A tensor of -// values to add to ref. +// reader_handle: Handle to a Reader. // // Returns the created operation. -func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) { +func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) { if scope.Err() != nil { return } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } opspec := tf.OpSpec{ - Type: "ResourceScatterNdAdd", + Type: "ReaderResetV2", Input: []tf.Input{ - ref, indices, updates, + reader_handle, }, - Attrs: attrs, } return scope.AddOperation(opspec) } -// ReduceJoinAttr is an optional argument to ReduceJoin. -type ReduceJoinAttr func(optionalAttr) - -// ReduceJoinKeepDims sets the optional keep_dims attribute to value. -// -// value: If `True`, retain reduced dimensions with length `1`. -// If not specified, defaults to false -func ReduceJoinKeepDims(value bool) ReduceJoinAttr { - return func(m optionalAttr) { - m["keep_dims"] = value - } -} - -// ReduceJoinSeparator sets the optional separator attribute to value. -// -// value: The separator to use when joining. -// If not specified, defaults to "" -func ReduceJoinSeparator(value string) ReduceJoinAttr { - return func(m optionalAttr) { - m["separator"] = value - } -} - -// Joins a string Tensor across the given dimensions. -// -// Computes the string join across dimensions in the given string Tensor of shape -// `[\\(d_0, d_1, ..., d_{n-1}\\)]`. 
Returns a new Tensor created by joining the input -// strings with the given separator (default: empty string). Negative indices are -// counted backwards from the end, with `-1` being equivalent to `n - 1`. If -// indices are not specified, joins across all dimensions beginning from `n - 1` -// through `0`. -// -// For example: -// -// ```python -// # tensor `a` is [["a", "b"], ["c", "d"]] -// tf.reduce_join(a, 0) ==> ["ac", "bd"] -// tf.reduce_join(a, 1) ==> ["ab", "cd"] -// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"] -// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"] -// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]] -// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]] -// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"] -// tf.reduce_join(a, [0, 1]) ==> "acbd" -// tf.reduce_join(a, [1, 0]) ==> "abcd" -// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]] -// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd" -// ``` -// -// Arguments: -// inputs: The input to be joined. All reduced indices must have non-zero size. -// reduction_indices: The dimensions to reduce over. Dimensions are reduced in the -// order specified. Omitting `reduction_indices` is equivalent to passing -// `[n-1, n-2, ..., 0]`. Negative indices from `-n` to `-1` are supported. -// -// Returns Has shape equal to that of the input with reduced dimensions removed or -// set to `1` depending on `keep_dims`. -func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ReduceJoin", - Input: []tf.Input{ - inputs, reduction_indices, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StatefulStandardNormalV2Attr is an optional argument to StatefulStandardNormalV2. -type StatefulStandardNormalV2Attr func(optionalAttr) - -// StatefulStandardNormalV2Dtype sets the optional dtype attribute to value. -// -// value: The type of the output. -// If not specified, defaults to DT_FLOAT -func StatefulStandardNormalV2Dtype(value tf.DataType) StatefulStandardNormalV2Attr { - return func(m optionalAttr) { - m["dtype"] = value - } -} - -// Outputs random values from a normal distribution. -// -// The generated values will have mean 0 and standard deviation 1. -// -// Arguments: -// resource: The handle of the resource variable that stores the state of the RNG. -// algorithm: The RNG algorithm. -// shape: The shape of the output tensor. -// -// Returns A tensor of the specified shape filled with random normal values. -func StatefulStandardNormalV2(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulStandardNormalV2Attr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StatefulStandardNormalV2", - Input: []tf.Input{ - resource, algorithm, shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// FusedBatchNormAttr is an optional argument to FusedBatchNorm. -type FusedBatchNormAttr func(optionalAttr) - -// FusedBatchNormEpsilon sets the optional epsilon attribute to value. -// -// value: A small float number added to the variance of x. 
-// If not specified, defaults to 0.0001 -func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr { - return func(m optionalAttr) { - m["epsilon"] = value - } -} - -// FusedBatchNormDataFormat sets the optional data_format attribute to value. -// -// value: The data format for x and y. Either "NHWC" (default) or "NCHW". -// If not specified, defaults to "NHWC" -func FusedBatchNormDataFormat(value string) FusedBatchNormAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// FusedBatchNormIsTraining sets the optional is_training attribute to value. -// -// value: A bool value to indicate the operation is for training (default) -// or inference. -// If not specified, defaults to true -func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr { - return func(m optionalAttr) { - m["is_training"] = value - } -} - -// Batch normalization. -// -// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". -// The size of 1D Tensors matches the dimension C of the 4D Tensors. -// -// Arguments: -// x: A 4D Tensor for input data. -// scale: A 1D Tensor for scaling factor, to scale the normalized x. -// offset: A 1D Tensor for offset, to shift to the normalized x. -// mean: A 1D Tensor for population mean. Used for inference only; -// must be empty for training. -// variance: A 1D Tensor for population variance. Used for inference only; -// must be empty for training. -// -// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow -// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by -// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused -// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance -// in the cuDNN case), to be reused in the gradient computation. -func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "FusedBatchNorm", - Input: []tf.Input{ - x, scale, offset, mean, variance, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4) -} - -// Generate a sharded filename. The filename is printf formatted as -// -// %s-%05d-of-%05d, basename, shard, num_shards. -func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ShardedFilename", - Input: []tf.Input{ - basename, shard, num_shards, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the gradient of morphological 2-D dilation with respect to the filter. -// -// Arguments: -// input: 4-D with shape `[batch, in_height, in_width, depth]`. -// filter: 3-D with shape `[filter_height, filter_width, depth]`. -// out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`. -// strides: 1-D of length 4. The stride of the sliding window for each dimension of -// the input tensor. Must be: `[1, stride_height, stride_width, 1]`. -// rates: 1-D of length 4. The input stride for atrous morphological dilation. 
-// Must be: `[1, rate_height, rate_width, 1]`. -// padding: The type of padding algorithm to use. -// -// Returns 3-D with shape `[filter_height, filter_width, depth]`. -func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding} - opspec := tf.OpSpec{ - Type: "Dilation2DBackpropFilter", - Input: []tf.Input{ - input, filter, out_backprop, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes fingerprints of the input strings. -// -// Arguments: -// input: vector of strings to compute fingerprints on. -// -// Returns a (N,2) shaped matrix where N is the number of elements in the input -// vector. Each row contains the low and high parts of the fingerprint. -func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SdcaFprint", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a TensorList which, when stacked, has the value of `tensor`. -// -// Each tensor in the result list corresponds to one row of the input tensor. -// -// tensor: The input tensor. -// output_handle: The list. -func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorListFromTensor", - Input: []tf.Input{ - tensor, element_shape, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Converts each string in the input Tensor to its hash mod by a number of buckets. -// -// The hash function is deterministic on the content of the string within the -// process. The hash function is a keyed hash function, where attribute `key` -// defines the key of the hash function. `key` is an array of 2 elements. -// -// A strong hash is important when inputs may be malicious, e.g. URLs with -// additional components. Adversaries could try to make their inputs hash to the -// same bucket for a denial-of-service attack or to skew the results. A strong -// hash prevents this by making it difficult, if not infeasible, to compute inputs -// that hash to the same bucket. This comes at a cost of roughly 4x higher compute -// time than `tf.string_to_hash_bucket_fast`. -// -// Arguments: -// input: The strings to assign a hash bucket. -// num_buckets: The number of buckets. -// key: The key for the keyed hash function passed as a list of two uint64 -// elements. -// -// Returns A Tensor of the same shape as the input `string_tensor`. -func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key} - opspec := tf.OpSpec{ - Type: "StringToHashBucketStrong", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Decode web-safe base64-encoded strings. -// -// Input may or may not have padding at the end. See EncodeBase64 for padding. -// Web-safe means that input must use - and _ instead of + and /. -// -// Arguments: -// input: Base64 strings to decode. -// -// Returns Decoded strings. 
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "DecodeBase64", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Determine the script codes of a given tensor of Unicode integer code points. -// -// This operation converts Unicode code points to script codes corresponding to -// each code point. Script codes correspond to International Components for -// Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html. -// Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will -// match input shape. -// -// Arguments: -// input: A Tensor of int32 Unicode code points. -// -// Returns A Tensor of int32 script codes corresponding to each input code point. -func UnicodeScript(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "UnicodeScript", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a dataset that emits the records from one or more TFRecord files. -// -// Arguments: -// filenames: A scalar or vector containing the name(s) of the file(s) to be -// read. -// compression_type: A scalar containing either (i) the empty string (no -// compression), (ii) "ZLIB", or (iii) "GZIP". -// buffer_size: A scalar representing the number of bytes to buffer. A value of -// 0 means no buffering will be performed. -func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TFRecordDataset", - Input: []tf.Input{ - filenames, compression_type, buffer_size, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// SkipgramAttr is an optional argument to Skipgram. -type SkipgramAttr func(optionalAttr) - -// SkipgramWindowSize sets the optional window_size attribute to value. -// -// value: The number of words to predict to the left and right of the target. -// If not specified, defaults to 5 -func SkipgramWindowSize(value int64) SkipgramAttr { - return func(m optionalAttr) { - m["window_size"] = value - } -} - -// SkipgramMinCount sets the optional min_count attribute to value. -// -// value: The minimum number of word occurrences for it to be included in the -// vocabulary. -// If not specified, defaults to 5 -func SkipgramMinCount(value int64) SkipgramAttr { - return func(m optionalAttr) { - m["min_count"] = value - } -} - -// SkipgramSubsample sets the optional subsample attribute to value. -// -// value: Threshold for word occurrence. Words that appear with higher -// frequency will be randomly down-sampled. Set to 0 to disable. -// If not specified, defaults to 0.001 -func SkipgramSubsample(value float32) SkipgramAttr { - return func(m optionalAttr) { - m["subsample"] = value - } -} - -// Parses a text file and creates a batch of examples. -// -// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result -// -// Arguments: -// filename: The corpus's text file name. -// batch_size: The size of produced batch. -// -// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids. 
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Skipgram", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6) -} - -// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal. -type StatelessRandomNormalAttr func(optionalAttr) - -// StatelessRandomNormalDtype sets the optional dtype attribute to value. -// -// value: The type of the output. -// If not specified, defaults to DT_FLOAT -func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr { - return func(m optionalAttr) { - m["dtype"] = value - } -} - -// Outputs deterministic pseudorandom values from a normal distribution. -// -// The generated values will have mean 0 and standard deviation 1. -// -// The outputs are a deterministic function of `shape` and `seed`. -// -// Arguments: -// shape: The shape of the output tensor. -// seed: 2 seeds (shape [2]). -// -// Returns Random values with specified shape. -func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StatelessRandomNormal", - Input: []tf.Input{ - shape, seed, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal. -type StatelessTruncatedNormalAttr func(optionalAttr) - -// StatelessTruncatedNormalDtype sets the optional dtype attribute to value. -// -// value: The type of the output. -// If not specified, defaults to DT_FLOAT -func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr { - return func(m optionalAttr) { - m["dtype"] = value - } -} - -// Outputs deterministic pseudorandom values from a truncated normal distribution. -// -// The generated values follow a normal distribution with mean 0 and standard -// deviation 1, except that values whose magnitude is more than 2 standard -// deviations from the mean are dropped and re-picked. -// -// The outputs are a deterministic function of `shape` and `seed`. -// -// Arguments: -// shape: The shape of the output tensor. -// seed: 2 seeds (shape [2]). -// -// Returns Random values with specified shape. -func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StatelessTruncatedNormal", - Input: []tf.Input{ - shape, seed, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StagePeekAttr is an optional argument to StagePeek. -type StagePeekAttr func(optionalAttr) - -// StagePeekCapacity sets the optional capacity attribute to value. 
-// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func StagePeekCapacity(value int64) StagePeekAttr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// StagePeekMemoryLimit sets the optional memory_limit attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func StagePeekMemoryLimit(value int64) StagePeekAttr { - return func(m optionalAttr) { - m["memory_limit"] = value - } -} - -// StagePeekContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func StagePeekContainer(value string) StagePeekAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// StagePeekSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func StagePeekSharedName(value string) StagePeekAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Op peeks at the values at the specified index. If the -// -// underlying container does not contain sufficient elements -// this op will block until it does. This Op is optimized for -// performance. -func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtypes": dtypes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StagePeek", - Input: []tf.Input{ - index, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if values, idx, err = makeOutputList(op, idx, "values"); err != nil { - scope.UpdateErr("StagePeek", err) - return - } - return values -} - -// QuantizedMatMulAttr is an optional argument to QuantizedMatMul. -type QuantizedMatMulAttr func(optionalAttr) - -// QuantizedMatMulToutput sets the optional Toutput attribute to value. -// If not specified, defaults to DT_QINT32 -func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr { - return func(m optionalAttr) { - m["Toutput"] = value - } -} - -// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value. -// -// value: If true, `a` is transposed before multiplication. -// If not specified, defaults to false -func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr { - return func(m optionalAttr) { - m["transpose_a"] = value - } -} - -// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value. -// -// value: If true, `b` is transposed before multiplication. -// If not specified, defaults to false -func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr { - return func(m optionalAttr) { - m["transpose_b"] = value - } -} - -// QuantizedMatMulTactivation sets the optional Tactivation attribute to value. -// -// value: The type of output produced by activation function -// following this operation. -// If not specified, defaults to DT_QUINT8 -func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr { - return func(m optionalAttr) { - m["Tactivation"] = value - } -} - -// Perform a quantized matrix multiplication of `a` by the matrix `b`. -// -// The inputs must be two-dimensional matrices and the inner dimension of -// `a` (after being transposed if `transpose_a` is non-zero) must match the -// outer dimension of `b` (after being transposed if `transposed_b` is -// non-zero). -// -// Arguments: -// a: Must be a two-dimensional tensor. -// b: Must be a two-dimensional tensor. 
-// min_a: The float value that the lowest quantized `a` value represents. -// max_a: The float value that the highest quantized `a` value represents. -// min_b: The float value that the lowest quantized `b` value represents. -// max_b: The float value that the highest quantized `b` value represents. -// -// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. -func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QuantizedMatMul", - Input: []tf.Input{ - a, b, min_a, max_a, min_b, max_b, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Fast Fourier transform. -// -// Computes the 1-dimensional discrete Fourier transform over the inner-most -// dimension of `input`. -// -// Arguments: -// input: A complex tensor. -// -// Returns A complex tensor of the same shape as `input`. The inner-most -// dimension of `input` is replaced with its 1D Fourier transform. -// -// @compatibility(numpy) -// Equivalent to np.fft.fft -// @end_compatibility -func FFT(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "FFT", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// DecodeCSVAttr is an optional argument to DecodeCSV. -type DecodeCSVAttr func(optionalAttr) - -// DecodeCSVFieldDelim sets the optional field_delim attribute to value. -// -// value: char delimiter to separate fields in a record. -// If not specified, defaults to "," -func DecodeCSVFieldDelim(value string) DecodeCSVAttr { - return func(m optionalAttr) { - m["field_delim"] = value - } -} - -// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value. -// -// value: If false, treats double quotation marks as regular -// characters inside of the string fields (ignoring RFC 4180, Section 2, -// Bullet 5). -// If not specified, defaults to true -func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr { - return func(m optionalAttr) { - m["use_quote_delim"] = value - } -} - -// DecodeCSVNaValue sets the optional na_value attribute to value. -// -// value: Additional string to recognize as NA/NaN. -// If not specified, defaults to "" -func DecodeCSVNaValue(value string) DecodeCSVAttr { - return func(m optionalAttr) { - m["na_value"] = value - } -} - -// DecodeCSVSelectCols sets the optional select_cols attribute to value. -// If not specified, defaults to <> -func DecodeCSVSelectCols(value []int64) DecodeCSVAttr { - return func(m optionalAttr) { - m["select_cols"] = value - } -} - -// Convert CSV records to tensors. Each column maps to one tensor. -// -// RFC 4180 format is expected for the CSV records. -// (https://tools.ietf.org/html/rfc4180) -// Note that we allow leading and trailing spaces with int or float field. -// -// Arguments: -// records: Each string is a record/row in the csv and all records should have -// the same format. -// record_defaults: One tensor per column of the input record, with either a -// scalar default value for that column or an empty vector if the column is -// required. 
-// -// Returns Each tensor will have the same shape as records. -func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "DecodeCSV", - Input: []tf.Input{ - records, tf.OutputList(record_defaults), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if output, idx, err = makeOutputList(op, idx, "output"); err != nil { - scope.UpdateErr("DecodeCSV", err) - return - } - return output -} - -// Greedily selects a subset of bounding boxes in descending order of score, -// -// pruning away boxes that have high overlaps -// with previously selected boxes. Bounding boxes with score less than -// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix, -// which allows for defining a custom overlap criterium (eg. intersection over union, -// intersection over area, etc.). -// -// The output of this operation is a set of integers indexing into the input -// collection of bounding boxes representing the selected boxes. The bounding -// box coordinates corresponding to the selected indices can then be obtained -// using the `tf.gather operation`. For example: -// -// selected_indices = tf.image.non_max_suppression_with_overlaps( -// overlaps, scores, max_output_size, overlap_threshold, score_threshold) -// selected_boxes = tf.gather(boxes, selected_indices) -// -// Arguments: -// overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing -// the n-by-n box overlap values. -// scores: A 1-D float tensor of shape `[num_boxes]` representing a single -// score corresponding to each box (each row of boxes). -// max_output_size: A scalar integer tensor representing the maximum number of -// boxes to be selected by non max suppression. -// overlap_threshold: A 0-D float tensor representing the threshold for deciding whether -// boxes overlap too. -// score_threshold: A 0-D float tensor representing the threshold for deciding when to remove -// boxes based on score. -// -// Returns A 1-D integer tensor of shape `[M]` representing the selected -// indices from the boxes tensor, where `M <= max_output_size`. -func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "NonMaxSuppressionWithOverlaps", - Input: []tf.Input{ - overlaps, scores, max_output_size, overlap_threshold, score_threshold, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes natural logarithm of x element-wise. -// -// I.e., \\(y = \log_e x\\). -func Log(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Log", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// IdentityReaderV2Attr is an optional argument to IdentityReaderV2. -type IdentityReaderV2Attr func(optionalAttr) - -// IdentityReaderV2Container sets the optional container attribute to value. -// -// value: If non-empty, this reader is placed in the given container. -// Otherwise, a default container is used. 
-// If not specified, defaults to "" -func IdentityReaderV2Container(value string) IdentityReaderV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// IdentityReaderV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this reader is named in the given bucket -// with this shared_name. Otherwise, the node name is used instead. -// If not specified, defaults to "" -func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// A Reader that outputs the queued work as both the key and value. -// -// To use, enqueue strings in a Queue. ReaderRead will take the front -// work string and output (work, work). -// -// Returns The handle to reference the Reader. -func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "IdentityReaderV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ExtractGlimpseAttr is an optional argument to ExtractGlimpse. -type ExtractGlimpseAttr func(optionalAttr) - -// ExtractGlimpseCentered sets the optional centered attribute to value. -// -// value: indicates if the offset coordinates are centered relative to -// the image, in which case the (0, 0) offset is relative to the center -// of the input images. If false, the (0,0) offset corresponds to the -// upper left corner of the input images. -// If not specified, defaults to true -func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr { - return func(m optionalAttr) { - m["centered"] = value - } -} - -// ExtractGlimpseNormalized sets the optional normalized attribute to value. -// -// value: indicates if the offset coordinates are normalized. -// If not specified, defaults to true -func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr { - return func(m optionalAttr) { - m["normalized"] = value - } -} - -// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value. -// -// value: indicates if the noise should be generated using a -// uniform distribution or a Gaussian distribution. -// If not specified, defaults to true -func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr { - return func(m optionalAttr) { - m["uniform_noise"] = value - } -} - -// ExtractGlimpseNoise sets the optional noise attribute to value. -// -// value: indicates if the noise should `uniform`, `gaussian`, or -// `zero`. The default is `uniform` which means the the noise type -// will be decided by `uniform_noise`. -// If not specified, defaults to "uniform" -func ExtractGlimpseNoise(value string) ExtractGlimpseAttr { - return func(m optionalAttr) { - m["noise"] = value - } -} - -// Extracts a glimpse from the input tensor. -// -// Returns a set of windows called glimpses extracted at location -// `offsets` from the input tensor. If the windows only partially -// overlaps the inputs, the non overlapping areas will be filled with -// random noise. -// -// The result is a 4-D tensor of shape `[batch_size, glimpse_height, -// glimpse_width, channels]`. The channels and batch dimensions are the -// same as that of the input tensor. The height and width of the output -// windows are specified in the `size` parameter. 
-// -// The argument `normalized` and `centered` controls how the windows are built: -// -// * If the coordinates are normalized but not centered, 0.0 and 1.0 -// correspond to the minimum and maximum of each height and width -// dimension. -// * If the coordinates are both normalized and centered, they range from -// -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper -// left corner, the lower right corner is located at (1.0, 1.0) and the -// center is at (0, 0). -// * If the coordinates are not normalized they are interpreted as -// numbers of pixels. -// -// Arguments: -// input: A 4-D float tensor of shape `[batch_size, height, width, channels]`. -// size: A 1-D tensor of 2 elements containing the size of the glimpses -// to extract. The glimpse height must be specified first, following -// by the glimpse width. -// offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing -// the y, x locations of the center of each window. -// -// Returns A tensor representing the glimpses `[batch_size, -// glimpse_height, glimpse_width, channels]`. -func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ExtractGlimpse", - Input: []tf.Input{ - input, size, offsets, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Outputs a `Summary` protocol buffer with a histogram. -// -// The generated -// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) -// has one summary value containing a histogram for `values`. -// -// This op reports an `InvalidArgument` error if any value is not finite. -// -// Arguments: -// tag: Scalar. Tag to use for the `Summary.Value`. -// values: Any shape. Values to use to build the histogram. -// -// Returns Scalar. Serialized `Summary` protocol buffer. -func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "HistogramSummary", - Input: []tf.Input{ - tag, values, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Fills empty rows in the input 2-D `SparseTensor` with a default value. -// -// The input `SparseTensor` is represented via the tuple of inputs -// (`indices`, `values`, `dense_shape`). The output `SparseTensor` has the -// same `dense_shape` but with indices `output_indices` and values -// `output_values`. -// -// This op inserts a single entry for every row that doesn't have any values. -// The index is created as `[row, 0, ..., 0]` and the inserted value -// is `default_value`. -// -// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values: -// -// [0, 1]: a -// [0, 3]: b -// [2, 0]: c -// [3, 1]: d -// -// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values: -// -// [0, 1]: a -// [0, 3]: b -// [1, 0]: default_value -// [2, 0]: c -// [3, 1]: d -// [4, 0]: default_value -// -// The output `SparseTensor` will be in row-major order and will have the -// same shape as the input. -// -// This op also returns an indicator vector shaped `[dense_shape[0]]` such that -// -// empty_row_indicator[i] = True iff row i was an empty row. -// -// And a reverse index map vector shaped `[indices.shape[0]]` that is used during -// backpropagation, -// -// reverse_index_map[j] = out_j s.t. 
indices[j, :] == output_indices[out_j, :] -// -// Arguments: -// indices: 2-D. the indices of the sparse tensor. -// values: 1-D. the values of the sparse tensor. -// dense_shape: 1-D. the shape of the sparse tensor. -// default_value: 0-D. default value to insert into location `[row, 0, ..., 0]` -// for rows missing from the input sparse tensor. -// output indices: 2-D. the indices of the filled sparse tensor. -// -// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the -// input sparse tensor.1-D. a map from the input indices to the output indices. -func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseFillEmptyRows", - Input: []tf.Input{ - indices, values, dense_shape, default_value, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3) -} - -// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA. -type ResourceSparseApplyAdagradDAAttr func(optionalAttr) - -// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value. -// -// value: If True, updating of the var and accum tensors will be protected by -// a lock; otherwise the behavior is undefined, but may exhibit less contention. -// If not specified, defaults to false -func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update entries in '*var' and '*accum' according to the proximal adagrad scheme. -// -// Arguments: -// var_: Should be from a Variable(). -// gradient_accumulator: Should be from a Variable(). -// gradient_squared_accumulator: Should be from a Variable(). -// grad: The gradient. -// indices: A vector of indices into the first dimension of var and accum. -// lr: Learning rate. Must be a scalar. -// l1: L1 regularization. Must be a scalar. -// l2: L2 regularization. Must be a scalar. -// global_step: Training step number. Must be a scalar. -// -// Returns the created operation. -func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceSparseApplyAdagradDA", - Input: []tf.Input{ - var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`. -// -// This Op does not require `a_indices` be sorted in standard lexicographic order. -// -// Arguments: -// a_indices: 2-D. The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`. -// a_values: 1-D. The `values` of the `SparseTensor`, with shape `[nnz]`. -// a_shape: 1-D. The `shape` of the `SparseTensor`, with shape `[ndims]`. -// b: `ndims`-D Tensor. With shape `a_shape`. 
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseTensorDenseAdd", - Input: []tf.Input{ - a_indices, a_values, a_shape, b, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Increments variable pointed to by 'resource' until it reaches 'limit'. -// -// Arguments: -// resource: Should be from a scalar `Variable` node. -// limit: If incrementing ref would bring it above limit, instead generates an -// 'OutOfRange' error. -// -// -// Returns A copy of the input before increment. If nothing else modifies the -// input, the values produced will all be distinct. -func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"limit": limit, "T": T} - opspec := tf.OpSpec{ - Type: "ResourceCountUpTo", - Input: []tf.Input{ - resource, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes rectified linear gradients for a Relu operation. -// -// Arguments: -// gradients: The backpropagated gradients to the corresponding Relu operation. -// features: The features passed as input to the corresponding Relu operation, OR -// the outputs of that operation (both work equivalently). -// -// Returns `gradients * (features > 0)`. -func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ReluGrad", - Input: []tf.Input{ - gradients, features, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// QuantizedConv2DAttr is an optional argument to QuantizedConv2D. -type QuantizedConv2DAttr func(optionalAttr) - -// QuantizedConv2DOutType sets the optional out_type attribute to value. -// If not specified, defaults to DT_QINT32 -func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// QuantizedConv2DDilations sets the optional dilations attribute to value. -// -// value: 1-D tensor of length 4. The dilation factor for each dimension of -// `input`. If set to k > 1, there will be k-1 skipped cells between each -// filter element on that dimension. The dimension order is determined by the -// value of `data_format`, see above for details. Dilations in the batch and -// depth dimensions must be 1. -// If not specified, defaults to <i:1 i:1 i:1 i:1 > -func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes a 2D convolution given quantized 4D input and filter tensors. -// -// The inputs are quantized tensors where the lowest value represents the real -// number of the associated minimum, and the highest represents the maximum. -// This means that you can only interpret the quantized output in the same way, by -// taking the returned minimum and maximum values into account. -// -// Arguments: -// -// filter: filter's input_depth dimension must match input's depth dimensions. -// min_input: The float value that the lowest quantized input value represents. -// max_input: The float value that the highest quantized input value represents. -// min_filter: The float value that the lowest quantized filter value represents. 
-// max_filter: The float value that the highest quantized filter value represents. -// strides: The stride of the sliding window for each dimension of the input -// tensor. -// padding: The type of padding algorithm to use. -// -// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. -func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QuantizedConv2D", - Input: []tf.Input{ - input, filter, min_input, max_input, min_filter, max_filter, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// MfccAttr is an optional argument to Mfcc. -type MfccAttr func(optionalAttr) - -// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value. -// -// value: The highest frequency to use when calculating the -// ceptstrum. -// If not specified, defaults to 4000 -func MfccUpperFrequencyLimit(value float32) MfccAttr { - return func(m optionalAttr) { - m["upper_frequency_limit"] = value - } -} - -// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value. -// -// value: The lowest frequency to use when calculating the -// ceptstrum. -// If not specified, defaults to 20 -func MfccLowerFrequencyLimit(value float32) MfccAttr { - return func(m optionalAttr) { - m["lower_frequency_limit"] = value - } -} - -// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value. -// -// value: Resolution of the Mel bank used internally. -// If not specified, defaults to 40 -func MfccFilterbankChannelCount(value int64) MfccAttr { - return func(m optionalAttr) { - m["filterbank_channel_count"] = value - } -} - -// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value. -// -// value: How many output channels to produce per time slice. -// If not specified, defaults to 13 -func MfccDctCoefficientCount(value int64) MfccAttr { - return func(m optionalAttr) { - m["dct_coefficient_count"] = value - } -} - -// Transforms a spectrogram into a form that's useful for speech recognition. -// -// Mel Frequency Cepstral Coefficients are a way of representing audio data that's -// been effective as an input feature for machine learning. They are created by -// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the -// higher frequencies that are less significant to the human ear. They have a long -// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum -// is a good resource to learn more. -// -// Arguments: -// spectrogram: Typically produced by the Spectrogram op, with magnitude_squared -// set to true. -// sample_rate: How many samples per second the source audio used. 
-func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Mfcc", - Input: []tf.Input{ - spectrogram, sample_rate, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Inverse real-valued fast Fourier transform. -// -// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued -// signal over the inner-most dimension of `input`. -// -// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the -// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If -// `fft_length` is not provided, it is computed from the size of the inner-most -// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to -// compute `input` is odd, it should be provided since it cannot be inferred -// properly. -// -// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller -// than the corresponding dimension of `input`, the dimension is cropped. If it is -// larger, the dimension is padded with zeros. -// -// Arguments: -// input: A complex64 tensor. -// fft_length: An int32 tensor of shape [1]. The FFT length. -// -// Returns A float32 tensor of the same rank as `input`. The inner-most -// dimension of `input` is replaced with the `fft_length` samples of its inverse -// 1D Fourier transform. -// -// @compatibility(numpy) -// Equivalent to np.fft.irfft -// @end_compatibility -func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "IRFFT", - Input: []tf.Input{ - input, fft_length, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ConfigureDistributedTPUAttr is an optional argument to ConfigureDistributedTPU. -type ConfigureDistributedTPUAttr func(optionalAttr) - -// ConfigureDistributedTPUEmbeddingConfig sets the optional embedding_config attribute to value. -// -// value: Reserved. Do not use. -// If not specified, defaults to "" -func ConfigureDistributedTPUEmbeddingConfig(value string) ConfigureDistributedTPUAttr { - return func(m optionalAttr) { - m["embedding_config"] = value - } -} - -// ConfigureDistributedTPUTpuEmbeddingConfig sets the optional tpu_embedding_config attribute to value. -// -// value: Serialized tensorflow.tpu.TPUEmbeddingConfiguration that -// describes the embedding lookups of the program. -// If not specified, defaults to "" -func ConfigureDistributedTPUTpuEmbeddingConfig(value string) ConfigureDistributedTPUAttr { - return func(m optionalAttr) { - m["tpu_embedding_config"] = value - } -} - -// ConfigureDistributedTPUIsGlobalInit sets the optional is_global_init attribute to value. -// -// value: Reserved. Do not use. -// If not specified, defaults to false -func ConfigureDistributedTPUIsGlobalInit(value bool) ConfigureDistributedTPUAttr { - return func(m optionalAttr) { - m["is_global_init"] = value - } -} - -// Sets up the centralized structures for a distributed TPU system. -// -// Returns A serialized tensorflow.tpu.TopologyProto that describes the TPU -// topology. 
-func ConfigureDistributedTPU(scope *Scope, optional ...ConfigureDistributedTPUAttr) (topology tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ConfigureDistributedTPU", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the sum along sparse segments of a tensor divided by the sqrt of N. -// -// N is the size of the segment being reduced. -// -// See `tf.sparse.segment_sum` for usage examples. -// -// -// Arguments: -// -// indices: A 1-D tensor. Has same rank as `segment_ids`. -// segment_ids: A 1-D tensor. Values should be sorted and can be repeated. -// -// Returns Has same shape as data, except for dimension 0 which -// has size `k`, the number of segments. -func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseSegmentSqrtN", - Input: []tf.Input{ - data, indices, segment_ids, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter. -type Conv3DBackpropFilterAttr func(optionalAttr) - -// Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 > -func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes the gradients of 3-D convolution with respect to the filter. -// -// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2 -// -// Arguments: -// input: Shape `[batch, depth, rows, cols, in_channels]`. -// filter: Shape `[depth, rows, cols, in_channels, out_channels]`. -// `in_channels` must match between `input` and `filter`. -// out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols, -// out_channels]`. -// strides: 1-D tensor of length 5. The stride of the sliding window for each -// dimension of `input`. Must have `strides[0] = strides[4] = 1`. -// padding: The type of padding algorithm to use. -func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Conv3DBackpropFilter", - Input: []tf.Input{ - input, filter, out_backprop, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Does nothing. Serves as a control trigger for scheduling. -// -// Only useful as a placeholder for control edges. -// -// Returns the created operation. -func ControlTrigger(scope *Scope) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ControlTrigger", - } - return scope.AddOperation(opspec) -} - -// Creates a MultiDeviceIterator resource. -// -// Arguments: -// devices: A list of devices the iterator works across. -// shared_name: If non-empty, this resource will be shared under the given name -// across multiple sessions. -// container: If non-empty, this resource is placed in the given container. -// Otherwise, a default container is used. -// output_types: The type list for the return values. 
-// output_shapes: The list of shapes being produced. -// -// Returns Handle to the resource created. -func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "MultiDeviceIterator", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes rectified linear 6: `min(max(features, 0), 6)`. -func Relu6(scope *Scope, features tf.Output) (activations tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Relu6", - Input: []tf.Input{ - features, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Conv3DAttr is an optional argument to Conv3D. -type Conv3DAttr func(optionalAttr) - -// Conv3DDataFormat sets the optional data_format attribute to value. -// -// value: The data format of the input and output data. With the -// default format "NDHWC", the data is stored in the order of: -// [batch, in_depth, in_height, in_width, in_channels]. -// Alternatively, the format could be "NCDHW", the data storage order is: -// [batch, in_channels, in_depth, in_height, in_width]. -// If not specified, defaults to "NDHWC" -func Conv3DDataFormat(value string) Conv3DAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Conv3DDilations sets the optional dilations attribute to value. -// -// value: 1-D tensor of length 5. The dilation factor for each dimension of -// `input`. If set to k > 1, there will be k-1 skipped cells between each -// filter element on that dimension. The dimension order is determined by the -// value of `data_format`, see above for details. Dilations in the batch and -// depth dimensions must be 1. -// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 > -func Conv3DDilations(value []int64) Conv3DAttr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes a 3-D convolution given 5-D `input` and `filter` tensors. -// -// In signal processing, cross-correlation is a measure of similarity of -// two waveforms as a function of a time-lag applied to one of them. This -// is also known as a sliding dot product or sliding inner-product. -// -// Our Conv3D implements a form of cross-correlation. -// -// Arguments: -// input: Shape `[batch, in_depth, in_height, in_width, in_channels]`. -// filter: Shape `[filter_depth, filter_height, filter_width, in_channels, -// out_channels]`. `in_channels` must match between `input` and `filter`. -// strides: 1-D tensor of length 5. The stride of the sliding window for each -// dimension of `input`. Must have `strides[0] = strides[4] = 1`. -// padding: The type of padding algorithm to use. -func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Conv3D", - Input: []tf.Input{ - input, filter, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// AvgPoolGradAttr is an optional argument to AvgPoolGrad. 
-type AvgPoolGradAttr func(optionalAttr) - -// AvgPoolGradDataFormat sets the optional data_format attribute to value. -// -// value: Specify the data format of the input and output data. With the -// default format "NHWC", the data is stored in the order of: -// [batch, in_height, in_width, in_channels]. -// Alternatively, the format could be "NCHW", the data storage order of: -// [batch, in_channels, in_height, in_width]. -// If not specified, defaults to "NHWC" -func AvgPoolGradDataFormat(value string) AvgPoolGradAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Computes gradients of the average pooling function. -// -// Arguments: -// orig_input_shape: 1-D. Shape of the original input to `avg_pool`. -// grad: 4-D with shape `[batch, height, width, channels]`. Gradients w.r.t. -// the output of `avg_pool`. -// ksize: The size of the sliding window for each dimension of the input. -// strides: The stride of the sliding window for each dimension of the input. -// padding: The type of padding algorithm to use. -// -// Returns 4-D. Gradients w.r.t. the input of `avg_pool`. -func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "AvgPoolGrad", - Input: []tf.Input{ - orig_input_shape, grad, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// 3D real-valued fast Fourier transform. -// -// Computes the 3-dimensional discrete Fourier transform of a real-valued signal -// over the inner-most 3 dimensions of `input`. -// -// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the -// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension -// of `output`: the zero-frequency term, followed by the `fft_length / 2` -// positive-frequency terms. -// -// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the -// corresponding dimension of `input`, the dimension is cropped. If it is larger, -// the dimension is padded with zeros. -// -// Arguments: -// input: A float32 tensor. -// fft_length: An int32 tensor of shape [3]. The FFT length for each dimension. -// -// Returns A complex64 tensor of the same rank as `input`. The inner-most 3 -// dimensions of `input` are replaced with the their 3D Fourier transform. The -// inner-most dimension contains `fft_length / 2 + 1` unique frequency -// components. -// -// @compatibility(numpy) -// Equivalent to np.fft.rfftn with 3 dimensions. -// @end_compatibility -func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "RFFT3D", - Input: []tf.Input{ - input, fft_length, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2. -type MaxPoolGradV2Attr func(optionalAttr) - -// MaxPoolGradV2DataFormat sets the optional data_format attribute to value. -// -// value: Specify the data format of the input and output data. With the -// default format "NHWC", the data is stored in the order of: -// [batch, in_height, in_width, in_channels]. 
-// Alternatively, the format could be "NCHW", the data storage order of: -// [batch, in_channels, in_height, in_width]. -// If not specified, defaults to "NHWC" -func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Computes gradients of the maxpooling function. -// -// Arguments: -// orig_input: The original input tensor. -// orig_output: The original output tensor. -// grad: 4-D. Gradients w.r.t. the output of `max_pool`. -// ksize: The size of the window for each dimension of the input tensor. -// strides: The stride of the sliding window for each dimension of the -// input tensor. -// padding: The type of padding algorithm to use. -// -// Returns Gradients w.r.t. the input to `max_pool`. -func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MaxPoolGradV2", - Input: []tf.Input{ - orig_input, orig_output, grad, ksize, strides, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2. -type FixedLengthRecordReaderV2Attr func(optionalAttr) - -// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value. -// -// value: Number of bytes in the header, defaults to 0. -// If not specified, defaults to 0 -func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr { - return func(m optionalAttr) { - m["header_bytes"] = value - } -} - -// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value. -// -// value: Number of bytes in the footer, defaults to 0. -// If not specified, defaults to 0 -func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr { - return func(m optionalAttr) { - m["footer_bytes"] = value - } -} - -// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value. -// -// value: Number of bytes to hop before each read. Default of 0 means using -// record_bytes. -// If not specified, defaults to 0 -func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr { - return func(m optionalAttr) { - m["hop_bytes"] = value - } -} - -// FixedLengthRecordReaderV2Container sets the optional container attribute to value. -// -// value: If non-empty, this reader is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this reader is named in the given bucket -// with this shared_name. Otherwise, the node name is used instead. -// If not specified, defaults to "" -func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value. -// -// value: The type of encoding for the file. Currently ZLIB and GZIP -// are supported. Defaults to none. 
-// If not specified, defaults to "" -func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr { - return func(m optionalAttr) { - m["encoding"] = value - } -} - -// A Reader that outputs fixed-length records from a file. -// -// Arguments: -// record_bytes: Number of bytes in the record. -// -// Returns The handle to reference the Reader. -func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"record_bytes": record_bytes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "FixedLengthRecordReaderV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Reads and outputs the entire contents of the input filename. func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) { if scope.Err() != nil { @@ -28407,1319 +38313,208 @@ func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) { return op.Output(0) } -// Computes the sum along sparse segments of a tensor. +// Writes contents to the file at input filename. Creates file and recursively // -// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is -// misisng, the `output` tensor at that position will be zeroed. +// creates directory if not existing. // -// Read -// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/sparse#Segmentation) -// for an explanation of segments. +// Arguments: +// filename: scalar. The name of the file to which we write the contents. +// contents: scalar. The content to be written to the output file. // -// For example: +// Returns the created operation. +func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "WriteFile", + Input: []tf.Input{ + filename, contents, + }, + } + return scope.AddOperation(opspec) +} + +// ResizeBicubicAttr is an optional argument to ResizeBicubic. +type ResizeBicubicAttr func(optionalAttr) + +// ResizeBicubicAlignCorners sets the optional align_corners attribute to value. +// +// value: If true, the centers of the 4 corner pixels of the input and output tensors are +// aligned, preserving the values at the corner pixels. Defaults to false. +// If not specified, defaults to false +func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr { + return func(m optionalAttr) { + m["align_corners"] = value + } +} + +// ResizeBicubicHalfPixelCenters sets the optional half_pixel_centers attribute to value. +// If not specified, defaults to false +func ResizeBicubicHalfPixelCenters(value bool) ResizeBicubicAttr { + return func(m optionalAttr) { + m["half_pixel_centers"] = value + } +} + +// Resize `images` to `size` using bicubic interpolation. +// +// Input images can be of different types but output images are always float. +// +// Arguments: +// images: 4-D with shape `[batch, height, width, channels]`. +// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The +// new size for the images. +// +// Returns 4-D with shape +// `[batch, new_height, new_width, channels]`. 
+func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeBicubic",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResizeBilinearAttr is an optional argument to ResizeBilinear.
+type ResizeBilinearAttr func(optionalAttr)
+
+// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// ResizeBilinearHalfPixelCenters sets the optional half_pixel_centers attribute to value.
+// If not specified, defaults to false
+func ResizeBilinearHalfPixelCenters(value bool) ResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["half_pixel_centers"] = value
+	}
+}
+
+// Resize `images` to `size` using bilinear interpolation.
+//
+// Input images can be of different types but output images are always float.
+//
+// Arguments:
+// images: 4-D with shape `[batch, height, width, channels]`.
+// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeBilinear",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Gather ragged slices from `params` axis `0` according to `indices`.
+//
+// Outputs a `RaggedTensor` output composed from `output_dense_values` and
+// `output_nested_splits`, such that:
+//
+// ```python
+// output.shape = indices.shape + params.shape[1:]
+// output.ragged_rank = indices.shape.ndims + params.ragged_rank
+// output[i...j, d0...dn] = params[indices[i...j], d0...dn]
+// ```
+//
+// where
+//
+// * `params =
+//   ragged.from_nested_row_splits(params_dense_values, params_nested_splits)`
+//   provides the values that should be gathered.
+// * `indices` is a dense tensor with dtype `int32` or `int64`, indicating which
+//   values should be gathered.
+// * `output =
+//   ragged.from_nested_row_splits(output_dense_values, output_nested_splits)`
+//   is the output tensor.
+//
+// (Note: This c++ op is used to implement the higher-level python
+// `tf.ragged.gather` op, which also supports ragged indices.)
+// +// // Arguments: +// params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the +// `params` RaggedTensor input. +// params_dense_values: The `flat_values` for the `params` RaggedTensor. There was a terminology change +// at the python level from dense_values to flat_values, so dense_values is the +// deprecated name. +// indices: Indices in the outermost dimension of `params` of the values that should be +// gathered. +// OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain +// this number of `row_splits` tensors. This value should equal +// `indices.shape.ndims + params.ragged_rank - 1`. // -// indices: A 1-D tensor. Has same rank as `segment_ids`. -// segment_ids: A 1-D tensor. Values should be sorted and can be repeated. -// num_segments: Should equal the number of distinct segment IDs. -// -// Returns Has same shape as data, except for dimension 0 which -// has size `num_segments`. -func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) { +// Returns The `nested_row_splits` tensors that define the row-partitioning for the +// returned RaggedTensor.The `flat_values` for the returned RaggedTensor. +func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) { if scope.Err() != nil { return } + attrs := map[string]interface{}{"OUTPUT_RAGGED_RANK": OUTPUT_RAGGED_RANK} opspec := tf.OpSpec{ - Type: "SparseSegmentSumWithNumSegments", + Type: "RaggedGather", Input: []tf.Input{ - data, indices, segment_ids, num_segments, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// LoadTPUEmbeddingADAMParametersAttr is an optional argument to LoadTPUEmbeddingADAMParameters. -type LoadTPUEmbeddingADAMParametersAttr func(optionalAttr) - -// LoadTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func LoadTPUEmbeddingADAMParametersTableId(value int64) LoadTPUEmbeddingADAMParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// LoadTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func LoadTPUEmbeddingADAMParametersTableName(value string) LoadTPUEmbeddingADAMParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Load ADAM embedding parameters. -// -// An op that loads optimization parameters into HBM for embedding. Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the ADAM optimization algorithm. -// momenta: Value of momenta used in the ADAM optimization algorithm. -// velocities: Value of velocities used in the ADAM optimization algorithm. -// -// -// -// Returns the created operation. 
-func LoadTPUEmbeddingADAMParameters(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingADAMParameters", - Input: []tf.Input{ - parameters, momenta, velocities, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg. -type DecodeAndCropJpegAttr func(optionalAttr) - -// DecodeAndCropJpegChannels sets the optional channels attribute to value. -// -// value: Number of color channels for the decoded image. -// If not specified, defaults to 0 -func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr { - return func(m optionalAttr) { - m["channels"] = value - } -} - -// DecodeAndCropJpegRatio sets the optional ratio attribute to value. -// -// value: Downscaling ratio. -// If not specified, defaults to 1 -func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr { - return func(m optionalAttr) { - m["ratio"] = value - } -} - -// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value. -// -// value: If true use a slower but nicer upscaling of the -// chroma planes (yuv420/422 only). -// If not specified, defaults to true -func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr { - return func(m optionalAttr) { - m["fancy_upscaling"] = value - } -} - -// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value. -// -// value: If true try to recover an image from truncated input. -// If not specified, defaults to false -func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr { - return func(m optionalAttr) { - m["try_recover_truncated"] = value - } -} - -// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value. -// -// value: The minimum required fraction of lines before a truncated -// input is accepted. -// If not specified, defaults to 1 -func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr { - return func(m optionalAttr) { - m["acceptable_fraction"] = value - } -} - -// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value. -// -// value: string specifying a hint about the algorithm used for -// decompression. Defaults to "" which maps to a system-specific -// default. Currently valid values are ["INTEGER_FAST", -// "INTEGER_ACCURATE"]. The hint may be ignored (e.g., the internal -// jpeg library changes to a version that does not have that specific -// option.) -// If not specified, defaults to "" -func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr { - return func(m optionalAttr) { - m["dct_method"] = value - } -} - -// Decode and Crop a JPEG-encoded image to a uint8 tensor. -// -// The attr `channels` indicates the desired number of color channels for the -// decoded image. -// -// Accepted values are: -// -// * 0: Use the number of channels in the JPEG-encoded image. -// * 1: output a grayscale image. -// * 3: output an RGB image. -// -// If needed, the JPEG-encoded image is transformed to match the requested number -// of color channels. -// -// The attr `ratio` allows downscaling the image by an integer factor during -// decoding. 
Allowed values are: 1, 2, 4, and 8. This is much faster than -// downscaling the image later. -// -// -// It is equivalent to a combination of decode and crop, but much faster by only -// decoding partial jpeg image. -// -// Arguments: -// contents: 0-D. The JPEG-encoded image. -// crop_window: 1-D. The crop window: [crop_y, crop_x, crop_height, crop_width]. -// -// Returns 3-D with shape `[height, width, channels]`.. -func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "DecodeAndCropJpeg", - Input: []tf.Input{ - contents, crop_window, + tf.OutputList(params_nested_splits), params_dense_values, indices, }, Attrs: attrs, } op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StageClearAttr is an optional argument to StageClear. -type StageClearAttr func(optionalAttr) - -// StageClearCapacity sets the optional capacity attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func StageClearCapacity(value int64) StageClearAttr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// StageClearMemoryLimit sets the optional memory_limit attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func StageClearMemoryLimit(value int64) StageClearAttr { - return func(m optionalAttr) { - m["memory_limit"] = value - } -} - -// StageClearContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func StageClearContainer(value string) StageClearAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// StageClearSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func StageClearSharedName(value string) StageClearAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Op removes all elements in the underlying container. -// -// Returns the created operation. -func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"dtypes": dtypes} - for _, a := range optional { - a(attrs) + var idx int + var err error + if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil { + scope.UpdateErr("RaggedGather", err) + return } - opspec := tf.OpSpec{ - Type: "StageClear", - - Attrs: attrs, - } - return scope.AddOperation(opspec) + output_dense_values = op.Output(idx) + return output_nested_splits, output_dense_values } -// QuantizedDepthwiseConv2DWithBiasAttr is an optional argument to QuantizedDepthwiseConv2DWithBias. -type QuantizedDepthwiseConv2DWithBiasAttr func(optionalAttr) - -// QuantizedDepthwiseConv2DWithBiasOutType sets the optional out_type attribute to value. -// -// value: The type of the output. -// If not specified, defaults to DT_QINT32 -func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwiseConv2DWithBiasAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. -// -// value: List of dilation values. 
-// If not specified, defaults to <i:1 i:1 i:1 i:1 > -func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes quantized depthwise Conv2D with Bias. -// -// Arguments: -// input: The original input tensor. -// filter: The original filter tensor. -// bias: The original bias tensor. -// min_input: The float value that the minimum quantized input value represents. -// max_input: The float value that the maximum quantized input value represents. -// min_filter: The float value that the minimum quantized filter value represents. -// max_filter: The float value that the maximum quantized filter value represents. -// strides: List of stride values. -// -// -// Returns The output tensor.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents. -func QuantizedDepthwiseConv2DWithBias(scope *Scope, input tf.Output, filter tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DWithBiasAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) { +// Creates a dataset that splits a SparseTensor into elements row-wise. +func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } opspec := tf.OpSpec{ - Type: "QuantizedDepthwiseConv2DWithBias", + Type: "SparseTensorSliceDataset", Input: []tf.Input{ - input, filter, bias, min_input, max_input, min_filter, max_filter, + indices, values, dense_shape, }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Elementwise computes the bitwise right-shift of `x` and `y`. -// -// Performs a logical shift for unsigned integer types, and an arithmetic shift -// for signed integer types. -// -// If `y` is negative, or greater than or equal to than the width of `x` in bits -// the result is implementation defined. -func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "RightShift", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr is an optional argument to QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize. -type QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr func(optionalAttr) - -// QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType sets the optional out_type attribute to value. -// -// value: The type of the output. -// If not specified, defaults to DT_QUINT8 -func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataType) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. -// -// value: List of dilation values. 
-// If not specified, defaults to <i:1 i:1 i:1 i:1 > -func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { - return func(m optionalAttr) { - m["dilations"] = value - } -} - -// Computes quantized depthwise Conv2D with Bias, Relu and Requantize. -// -// Arguments: -// input: The original input tensor. -// filter: The original filter tensor. -// bias: The original bias tensor. -// min_input: The float value that the minimum quantized input value represents. -// max_input: The float value that the maximum quantized input value represents. -// min_filter: The float value that the minimum quantized filter value represents. -// max_filter: The float value that the maximum quantized filter value represents. -// min_freezed_output: The minimum float value of the output tensor. -// max_freezed_output: The maximum float value of the output tensor. -// strides: List of stride values. -// -// -// Returns The output tensor.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents. -func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize(scope *Scope, input tf.Output, filter tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, min_freezed_output tf.Output, max_freezed_output tf.Output, strides []int64, padding string, optional ...QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize", - Input: []tf.Input{ - input, filter, bias, min_input, max_input, min_filter, max_filter, min_freezed_output, max_freezed_output, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Reduces `input` from `num_devices` using `reduction` to a single device. -// -// Reduces `input` from `num_devices` using `reduction` to a single device. -// -// The graph should be constructed so that all inputs have a valid device -// assignment, and the op itself is assigned one of these devices. -// -// input: The input to the reduction. -// data: the value of the reduction across all `num_devices` devices. -// reduction: the reduction operation to perform. -func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"reduction": reduction} - opspec := tf.OpSpec{ - Type: "NcclReduce", - Input: []tf.Input{ - tf.OutputList(input), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Subtracts `v` into specified rows of `x`. -// -// Computes y = x; y[i, :] -= v; return y. -// -// Arguments: -// x: A `Tensor` of type T. -// i: A vector. Indices into the left-most dimension of `x`. -// v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size. -// -// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`. 
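//
// A minimal client-side usage sketch of this wrapper (assuming the tensorflow/go
// `op` and `tf` packages are imported; the constant values below are illustrative only):
//
// ```go
// s := op.NewScope()
// x := op.Const(s, [][]float32{{1, 2}, {3, 4}, {5, 6}})
// i := op.Const(s, []int32{0, 2})                   // rows of x to update
// v := op.Const(s, [][]float32{{10, 10}, {20, 20}}) // same trailing shape as x, first dim = len(i)
// y := op.InplaceSub(s, x, i, v)                    // y = x; y[i, :] -= v
// ```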
-func InplaceSub(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "InplaceSub", - Input: []tf.Input{ - x, i, v, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the element-wise sum of a list of tensors. -// -// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not -// wait for all of its inputs to be ready before beginning to sum. This can -// save memory if inputs are ready at different times, since minimum temporary -// storage is proportional to the output size rather than the inputs size. -// -// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable. -// -// Returns a `Tensor` of same shape and type as the elements of `inputs`. -// -// Arguments: -// inputs: A list of `Tensor` objects, each with same shape and type. -// shape: Shape of elements of `inputs`. -func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"shape": shape} - opspec := tf.OpSpec{ - Type: "AccumulateNV2", - Input: []tf.Input{ - tf.OutputList(inputs), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Rounds the values of a tensor to the nearest integer, element-wise. -// -// Rounds half to even. Also known as bankers rounding. If you want to round -// according to the current system rounding mode use std::cint. -func Round(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Round", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// BatchMatMulAttr is an optional argument to BatchMatMul. -type BatchMatMulAttr func(optionalAttr) - -// BatchMatMulAdjX sets the optional adj_x attribute to value. -// -// value: If `True`, adjoint the slices of `x`. Defaults to `False`. -// If not specified, defaults to false -func BatchMatMulAdjX(value bool) BatchMatMulAttr { - return func(m optionalAttr) { - m["adj_x"] = value - } -} - -// BatchMatMulAdjY sets the optional adj_y attribute to value. -// -// value: If `True`, adjoint the slices of `y`. Defaults to `False`. -// If not specified, defaults to false -func BatchMatMulAdjY(value bool) BatchMatMulAttr { - return func(m optionalAttr) { - m["adj_y"] = value - } -} - -// Multiplies slices of two tensors in batches. -// -// Multiplies all slices of `Tensor` `x` and `y` (each slice can be -// viewed as an element of a batch), and arranges the individual results -// in a single output tensor of the same batch size. Each of the -// individual slices can optionally be adjointed (to adjoint a matrix -// means to transpose and conjugate it) before multiplication by setting -// the `adj_x` or `adj_y` flag to `True`, which are by default `False`. -// -// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]` -// and `[..., r_y, c_y]`. -// -// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where: -// -// r_o = c_x if adj_x else r_x -// c_o = r_y if adj_y else c_y -// -// It is computed as: -// -// output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) -// -// Arguments: -// x: 2-D or higher with shape `[..., r_x, c_x]`. -// y: 2-D or higher with shape `[..., r_y, c_y]`. 
-// -// Returns 3-D or higher with shape `[..., r_o, c_o]` -func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "BatchMatMul", - Input: []tf.Input{ - x, y, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingFTRLParametersGradAccumDebug. -type LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr) - -// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Load FTRL embedding parameters with debug support. -// -// An op that loads optimization parameters into HBM for embedding. Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the FTRL optimization algorithm. -// accumulators: Value of accumulators used in the FTRL optimization algorithm. -// linears: Value of linears used in the FTRL optimization algorithm. -// gradient_accumulators: Value of gradient_accumulators used in the FTRL optimization algorithm. -// -// -// -// Returns the created operation. -func LoadTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingFTRLParametersGradAccumDebug", - Input: []tf.Input{ - parameters, accumulators, linears, gradient_accumulators, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap. -type AddSparseToTensorsMapAttr func(optionalAttr) - -// AddSparseToTensorsMapContainer sets the optional container attribute to value. -// -// value: The container name for the `SparseTensorsMap` created by this op. -// If not specified, defaults to "" -func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value. -// -// value: The shared name for the `SparseTensorsMap` created by this op. -// If blank, the new Operation's unique name is used. 
-// If not specified, defaults to "" -func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Add a `SparseTensor` to a `SparseTensorsMap` return its handle. -// -// A `SparseTensor` is represented by three tensors: `sparse_indices`, -// `sparse_values`, and `sparse_shape`. -// -// This operator takes the given `SparseTensor` and adds it to a container -// object (a `SparseTensorsMap`). A unique key within this container is generated -// in the form of an `int64`, and this is the value that is returned. -// -// The `SparseTensor` can then be read out as part of a minibatch by passing -// the key as a vector element to `TakeManySparseFromTensorsMap`. To ensure -// the correct `SparseTensorsMap` is accessed, ensure that the same -// `container` and `shared_name` are passed to that Op. If no `shared_name` -// is provided here, instead use the *name* of the Operation created by calling -// `AddSparseToTensorsMap` as the `shared_name` passed to -// `TakeManySparseFromTensorsMap`. Ensure the Operations are colocated. -// -// Arguments: -// sparse_indices: 2-D. The `indices` of the `SparseTensor`. -// sparse_values: 1-D. The `values` of the `SparseTensor`. -// sparse_shape: 1-D. The `shape` of the `SparseTensor`. -// -// Returns 0-D. The handle of the `SparseTensor` now stored in the -// `SparseTensorsMap`. -func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "AddSparseToTensorsMap", - Input: []tf.Input{ - sparse_indices, sparse_values, sparse_shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// BatchMatMulV2Attr is an optional argument to BatchMatMulV2. -type BatchMatMulV2Attr func(optionalAttr) - -// BatchMatMulV2AdjX sets the optional adj_x attribute to value. -// -// value: If `True`, adjoint the slices of `x`. Defaults to `False`. -// If not specified, defaults to false -func BatchMatMulV2AdjX(value bool) BatchMatMulV2Attr { - return func(m optionalAttr) { - m["adj_x"] = value - } -} - -// BatchMatMulV2AdjY sets the optional adj_y attribute to value. -// -// value: If `True`, adjoint the slices of `y`. Defaults to `False`. -// If not specified, defaults to false -func BatchMatMulV2AdjY(value bool) BatchMatMulV2Attr { - return func(m optionalAttr) { - m["adj_y"] = value - } -} - -// Multiplies slices of two tensors in batches. -// -// Multiplies all slices of `Tensor` `x` and `y` (each slice can be -// viewed as an element of a batch), and arranges the individual results -// in a single output tensor of the same batch size. Each of the -// individual slices can optionally be adjointed (to adjoint a matrix -// means to transpose and conjugate it) before multiplication by setting -// the `adj_x` or `adj_y` flag to `True`, which are by default `False`. -// -// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]` -// and `[..., r_y, c_y]`. 
-// -// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where: -// -// r_o = c_x if adj_x else r_x -// c_o = r_y if adj_y else c_y -// -// It is computed as: -// -// output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) -// -// *NOTE*: `BatchMatMulV2` supports broadcasting in the batch dimensions. More -// about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html). -// -// -// Arguments: -// x: 2-D or higher with shape `[..., r_x, c_x]`. -// y: 2-D or higher with shape `[..., r_y, c_y]`. -// -// Returns 3-D or higher with shape `[..., r_o, c_o]` -func BatchMatMulV2(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulV2Attr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "BatchMatMulV2", - Input: []tf.Input{ - x, y, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// CastAttr is an optional argument to Cast. -type CastAttr func(optionalAttr) - -// CastTruncate sets the optional Truncate attribute to value. -// If not specified, defaults to false -func CastTruncate(value bool) CastAttr { - return func(m optionalAttr) { - m["Truncate"] = value - } -} - -// Cast x of type SrcT to y of DstT. -func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"DstT": DstT} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Cast", - Input: []tf.Input{ - x, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes numerical negative value element-wise. -// -// I.e., \\(y = -x\\). -func Neg(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Neg", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Read an element from the TensorArray into output `value`. -// -// Arguments: -// handle: The handle to a TensorArray. -// -// flow_in: A float scalar that enforces proper chaining of operations. -// dtype: The type of the elem that is returned. -// -// Returns The tensor that is read from the TensorArray. -func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - opspec := tf.OpSpec{ - Type: "TensorArrayReadV3", - Input: []tf.Input{ - handle, index, flow_in, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the minimum along segments of a tensor. -// -// Read -// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) -// for an explanation of segments. -// -// This operator is similar to the unsorted segment sum operator found -// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum). -// Instead of computing the sum over segments, it computes the minimum such that: -// -// \\(output_i = \min_{j...} data_[j...]\\) where min is over tuples `j...` such -// that `segment_ids[j...] == i`. -// -// If the minimum is empty for a given segment ID `i`, it outputs the largest -// possible value for the specific numeric type, -// `output[i] = numeric_limits<T>::max()`. 
-// -// For example: -// -// ``` python -// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]]) -// tf.unsorted_segment_min(c, tf.constant([0, 1, 0]), num_segments=2) -// # ==> [[ 1, 2, 2, 1], -// # [5, 6, 7, 8]] -// ``` -// -// If the given segment ID `i` is negative, then the corresponding value is -// dropped, and will not be included in the result. -// -// Arguments: -// -// segment_ids: A tensor whose shape is a prefix of `data.shape`. -// -// -// Returns Has same shape as data, except for the first `segment_ids.rank` -// dimensions, which are replaced with a single dimension which has size -// `num_segments`. -func UnsortedSegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "UnsortedSegmentMin", - Input: []tf.Input{ - data, segment_ids, num_segments, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// LuAttr is an optional argument to Lu. -type LuAttr func(optionalAttr) - -// LuOutputIdxType sets the optional output_idx_type attribute to value. -// If not specified, defaults to DT_INT32 -func LuOutputIdxType(value tf.DataType) LuAttr { - return func(m optionalAttr) { - m["output_idx_type"] = value - } -} - -// Computes the LU decomposition of one or more square matrices. -// -// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions -// form square matrices. -// -// The input has to be invertible. -// -// The output consists of two tensors LU and P containing the LU decomposition -// of all input submatrices `[..., :, :]`. LU encodes the lower triangular and -// upper triangular factors. -// -// For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of -// shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower -// triangular part of LU. U is a upper triangular matrix of shape `[M, M]` whose -// entries correspond to the upper triangular part, including the diagonal, of LU. -// -// P represents a permutation matrix encoded as a list of indices each between `0` -// and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to -// P, then the L, U and P satisfies P_mat * input = L * U. -// -// Arguments: -// input: A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of -// size `[M, M]`. -// -// Returns A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the -// lower triangular factor `L` with unit diagonal, and whose upper triangular part -// denotes the upper triangular factor `U`.Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is -// `[..., M]`. -// @compatibility(scipy) -// Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are -// packed into a single tensor, the permutation is applied to `input` instead of -// the right hand side and the permutation `P` is returned as a list of indices -// instead of a permutation matrix. -// @end_compatibility -func Lu(scope *Scope, input tf.Output, optional ...LuAttr) (lu tf.Output, p tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Lu", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// Creates a dataset with a range of values. Corresponds to python's xrange. 
-// -// Arguments: -// start: corresponds to start in python's xrange(). -// stop: corresponds to stop in python's xrange(). -// step: corresponds to step in python's xrange(). -// -// -func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "RangeDataset", - Input: []tf.Input{ - start, stop, step, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the reciprocal of x element-wise. -// -// I.e., \\(y = 1 / x\\). -func Inv(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Inv", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the gradient for the inverse of `x` wrt its input. -// -// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy` -// is the corresponding input gradient. -func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ReciprocalGrad", - Input: []tf.Input{ - y, dy, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the gradient for the rsqrt of `x` wrt its input. -// -// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy` -// is the corresponding input gradient. -func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "RsqrtGrad", - Input: []tf.Input{ - y, dy, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes exponential of x element-wise. \\(y = e^x\\). -func Exp(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Exp", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes natural logarithm of (1 + x) element-wise. -// -// I.e., \\(y = \log_e (1 + x)\\). -func Log1p(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Log1p", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes hyperbolic cosine of x element-wise. -func Cosh(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Cosh", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes hyperbolic tangent of `x` element-wise. -func Tanh(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Tanh", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes inverse hyperbolic sine of x element-wise. -func Asinh(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Asinh", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the truth value of (x != y) element-wise. -// -// *NOTE*: `NotEqual` supports broadcasting. 
More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "NotEqual", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the gradient for the tanh of `x` wrt its input. -// -// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy` -// is the corresponding input gradient. -func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TanhGrad", - Input: []tf.Input{ - y, dy, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics. -// -// Arguments: -// tree_ensemble_handle: Handle to the tree ensemble. -// -// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest -// layer. -func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "BoostedTreesGetEnsembleStates", - Input: []tf.Input{ - tree_ensemble_handle, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4) -} - -// Computes the log of the absolute value of `Gamma(x)` element-wise. -func Lgamma(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Lgamma", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler. -type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr) - -// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value. -// -// value: If either seed or seed2 are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value. -// -// value: An second seed to avoid seed collision. -// If not specified, defaults to 0 -func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Generates labels for candidate sampling with a learned unigram distribution. -// -// See explanations of candidate sampling and the data formats at -// go/candidate-sampling. -// -// For each batch, this op picks a single set of sampled candidate labels. -// -// The advantages of sampling candidates per-batch are simplicity and the -// possibility of efficient dense matrix multiplication. 
The disadvantage is that -// the sampled candidates must be chosen independently of the context and of the -// true labels. -// -// Arguments: -// true_classes: A batch_size * num_true matrix, in which each row contains the -// IDs of the num_true target_classes in the corresponding original label. -// num_true: Number of true labels per context. -// num_sampled: Number of candidates to randomly sample. -// unique: If unique is true, we sample with rejection, so that all sampled -// candidates in a batch are unique. This requires some approximation to -// estimate the post-rejection sampling probabilities. -// range_max: The sampler will sample integers from the interval [0, range_max). -// -// Returns A vector of length num_sampled, in which each element is -// the ID of a sampled candidate.A batch_size * num_true matrix, representing -// the number of times each candidate is expected to occur in a batch -// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled -// candidate representing the number of times the candidate is expected -// to occur in a batch of sampled candidates. If unique=true, then this is a -// probability. -func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ThreadUnsafeUnigramCandidateSampler", - Input: []tf.Input{ - true_classes, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// RandomStandardNormalAttr is an optional argument to RandomStandardNormal. -type RandomStandardNormalAttr func(optionalAttr) - -// RandomStandardNormalSeed sets the optional seed attribute to value. -// -// value: If either `seed` or `seed2` are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// RandomStandardNormalSeed2 sets the optional seed2 attribute to value. -// -// value: A second seed to avoid seed collision. -// If not specified, defaults to 0 -func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Outputs random values from a normal distribution. -// -// The generated values will have mean 0 and standard deviation 1. -// -// Arguments: -// shape: The shape of the output tensor. -// dtype: The type of the output. -// -// Returns A tensor of the specified shape filled with random normal values. 
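//
// A minimal usage sketch showing how this wrapper is called and run (assuming the
// standard tensorflow/go graph-building and Session API; error handling elided,
// shape and seed values illustrative):
//
// ```go
// s := op.NewScope()
// shape := op.Const(s, []int32{2, 3})
// samples := op.RandomStandardNormal(s, shape, tf.Float, op.RandomStandardNormalSeed(42))
// graph, _ := s.Finalize()
// sess, _ := tf.NewSession(graph, nil)
// out, _ := sess.Run(nil, []tf.Output{samples}, nil) // out[0] holds a 2x3 tensor of N(0, 1) samples
// ```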
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RandomStandardNormal", - Input: []tf.Input{ - shape, - }, - Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) @@ -29861,1471 +38656,45 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont return op.Output(0) } -// Computes the complementary error function of `x` element-wise. -func Erfc(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Erfc", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} +// GatherAttr is an optional argument to Gather. +type GatherAttr func(optionalAttr) -// Computes sin of x element-wise. -func Sin(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Sin", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the trignometric inverse sine of x element-wise. -// -// The `tf.math.asin` operation returns the inverse of `tf.math.sin`, such that -// if `y = tf.math.sin(x)` then, `x = tf.math.asin(y)`. -// -// **Note**: The output of `tf.math.asin` will lie within the invertible range -// of sine, i.e [-pi/2, pi/2]. -// -// For example: -// -// ```python -// # Note: [1.047, 0.785] ~= [(pi/3), (pi/4)] -// x = tf.constant([1.047, 0.785]) -// y = tf.math.sin(x) # [0.8659266, 0.7068252] -// -// tf.math.asin(y) # [1.047, 0.785] = x -// ``` -// -func Asin(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Asin", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes acos of x element-wise. -func Acos(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Acos", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Reverses specific dimensions of a tensor. -// -// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions -// of `tensor`, this operation reverses each dimension i of `tensor` where -// `dims[i]` is `True`. -// -// `tensor` can have up to 8 dimensions. The number of dimensions -// of `tensor` must equal the number of elements in `dims`. In other words: -// -// `rank(tensor) = size(dims)` -// -// For example: -// -// ``` -// # tensor 't' is [[[[ 0, 1, 2, 3], -// # [ 4, 5, 6, 7], -// # [ 8, 9, 10, 11]], -// # [[12, 13, 14, 15], -// # [16, 17, 18, 19], -// # [20, 21, 22, 23]]]] -// # tensor 't' shape is [1, 2, 3, 4] -// -// # 'dims' is [False, False, False, True] -// reverse(t, dims) ==> [[[[ 3, 2, 1, 0], -// [ 7, 6, 5, 4], -// [ 11, 10, 9, 8]], -// [[15, 14, 13, 12], -// [19, 18, 17, 16], -// [23, 22, 21, 20]]]] -// -// # 'dims' is [False, True, False, False] -// reverse(t, dims) ==> [[[[12, 13, 14, 15], -// [16, 17, 18, 19], -// [20, 21, 22, 23] -// [[ 0, 1, 2, 3], -// [ 4, 5, 6, 7], -// [ 8, 9, 10, 11]]]] -// -// # 'dims' is [False, False, True, False] -// reverse(t, dims) ==> [[[[8, 9, 10, 11], -// [4, 5, 6, 7], -// [0, 1, 2, 3]] -// [[20, 21, 22, 23], -// [16, 17, 18, 19], -// [12, 13, 14, 15]]]] -// ``` -// -// Arguments: -// tensor: Up to 8-D. -// dims: 1-D. 
The dimensions to reverse. -// -// Returns The same shape as `tensor`. -func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Reverse", - Input: []tf.Input{ - tensor, dims, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// UnpackAttr is an optional argument to Unpack. -type UnpackAttr func(optionalAttr) - -// UnpackAxis sets the optional axis attribute to value. -// -// value: Dimension along which to unpack. Negative values wrap around, so the -// valid range is `[-R, R)`. -// If not specified, defaults to 0 -func UnpackAxis(value int64) UnpackAttr { - return func(m optionalAttr) { - m["axis"] = value - } -} - -// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors. -// -// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension. -// For example, given a tensor of shape `(A, B, C, D)`; -// -// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]` -// and each tensor in `output` will have shape `(B, C, D)`. (Note that the -// dimension unpacked along is gone, unlike `split`). -// -// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]` -// and each tensor in `output` will have shape `(A, C, D)`. -// Etc. -// -// This is the opposite of `pack`. -// -// Arguments: -// value: 1-D or higher, with `axis` dimension size equal to `num`. -// -// -// Returns The list of tensors unpacked from `value`. -func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num": num} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Unpack", - Input: []tf.Input{ - value, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if output, idx, err = makeOutputList(op, idx, "output"); err != nil { - scope.UpdateErr("Unpack", err) - return - } - return output -} - -// Computes the trignometric inverse tangent of x element-wise. -// -// The `tf.math.atan` operation returns the inverse of `tf.math.tan`, such that -// if `y = tf.math.tan(x)` then, `x = tf.math.atan(y)`. -// -// **Note**: The output of `tf.math.atan` will lie within the invertible range -// of tan, i.e (-pi/2, pi/2). -// -// For example: -// -// ```python -// # Note: [1.047, 0.785] ~= [(pi/3), (pi/4)] -// x = tf.constant([1.047, 0.785]) -// y = tf.math.tan(x) # [1.731261, 0.99920404] -// -// tf.math.atan(y) # [1.047, 0.785] = x -// ``` -// -func Atan(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Atan", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// SerializeManySparseAttr is an optional argument to SerializeManySparse. -type SerializeManySparseAttr func(optionalAttr) - -// SerializeManySparseOutType sets the optional out_type attribute to value. -// -// value: The `dtype` to use for serialization; the supported types are `string` -// (default) and `variant`. -// If not specified, defaults to DT_STRING -func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object. 
-// -// The `SparseTensor` must have rank `R` greater than 1, and the first dimension -// is treated as the minibatch dimension. Elements of the `SparseTensor` -// must be sorted in increasing order of this first dimension. The serialized -// `SparseTensor` objects going into each row of `serialized_sparse` will have -// rank `R-1`. -// -// The minibatch size `N` is extracted from `sparse_shape[0]`. -// -// Arguments: -// sparse_indices: 2-D. The `indices` of the minibatch `SparseTensor`. -// sparse_values: 1-D. The `values` of the minibatch `SparseTensor`. -// sparse_shape: 1-D. The `shape` of the minibatch `SparseTensor`. -func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SerializeManySparse", - Input: []tf.Input{ - sparse_indices, sparse_values, sparse_shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResizeAreaAttr is an optional argument to ResizeArea. -type ResizeAreaAttr func(optionalAttr) - -// ResizeAreaAlignCorners sets the optional align_corners attribute to value. -// -// value: If true, the centers of the 4 corner pixels of the input and output tensors are -// aligned, preserving the values at the corner pixels. Defaults to false. -// If not specified, defaults to false -func ResizeAreaAlignCorners(value bool) ResizeAreaAttr { - return func(m optionalAttr) { - m["align_corners"] = value - } -} - -// Resize `images` to `size` using area interpolation. -// -// Input images can be of different types but output images are always float. -// -// The range of pixel values for the output image might be slightly different -// from the range for the input image because of limited numerical precision. -// To guarantee an output range, for example `[0.0, 1.0]`, apply -// `tf.clip_by_value` to the output. -// -// Each output pixel is computed by first transforming the pixel's footprint into -// the input tensor and then averaging the pixels that intersect the footprint. An -// input pixel's contribution to the average is weighted by the fraction of its -// area that intersects the footprint. This is the same as OpenCV's INTER_AREA. -// -// Arguments: -// images: 4-D with shape `[batch, height, width, channels]`. -// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The -// new size for the images. -// -// Returns 4-D with shape -// `[batch, new_height, new_width, channels]`. -func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResizeArea", - Input: []tf.Input{ - images, size, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the Bessel i0e function of `x` element-wise. -// -// Exponentially scaled modified Bessel function of order 0 defined as -// `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`. -// -// This function is faster and numerically stabler than `bessel_i0(x)`. 
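//
// A minimal usage sketch (assuming the tensorflow/go `op` package; the input values
// are illustrative only):
//
// ```go
// s := op.NewScope()
// x := op.Const(s, []float32{-1, 0, 2.5})
// y := op.BesselI0e(s, x) // elementwise exp(-|x|) * bessel_i0(x)
// ```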
-func BesselI0e(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "BesselI0e", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// FixedUnigramCandidateSamplerAttr is an optional argument to FixedUnigramCandidateSampler. -type FixedUnigramCandidateSamplerAttr func(optionalAttr) - -// FixedUnigramCandidateSamplerVocabFile sets the optional vocab_file attribute to value. -// -// value: Each valid line in this file (which should have a CSV-like format) -// corresponds to a valid word ID. IDs are in sequential order, starting from -// num_reserved_ids. The last entry in each line is expected to be a value -// corresponding to the count or relative probability. Exactly one of vocab_file -// and unigrams needs to be passed to this op. -// If not specified, defaults to "" -func FixedUnigramCandidateSamplerVocabFile(value string) FixedUnigramCandidateSamplerAttr { - return func(m optionalAttr) { - m["vocab_file"] = value - } -} - -// FixedUnigramCandidateSamplerDistortion sets the optional distortion attribute to value. -// -// value: The distortion is used to skew the unigram probability distribution. -// Each weight is first raised to the distortion's power before adding to the -// internal unigram distribution. As a result, distortion = 1.0 gives regular -// unigram sampling (as defined by the vocab file), and distortion = 0.0 gives -// a uniform distribution. -// If not specified, defaults to 1 -func FixedUnigramCandidateSamplerDistortion(value float32) FixedUnigramCandidateSamplerAttr { - return func(m optionalAttr) { - m["distortion"] = value - } -} - -// FixedUnigramCandidateSamplerNumReservedIds sets the optional num_reserved_ids attribute to value. -// -// value: Optionally some reserved IDs can be added in the range [0, -// ..., num_reserved_ids) by the users. One use case is that a special unknown -// word token is used as ID 0. These IDs will have a sampling probability of 0. -// If not specified, defaults to 0 -func FixedUnigramCandidateSamplerNumReservedIds(value int64) FixedUnigramCandidateSamplerAttr { - return func(m optionalAttr) { - m["num_reserved_ids"] = value - } -} - -// FixedUnigramCandidateSamplerNumShards sets the optional num_shards attribute to value. -// -// value: A sampler can be used to sample from a subset of the original range -// in order to speed up the whole computation through parallelism. This parameter -// (together with 'shard') indicates the number of partitions that are being -// used in the overall computation. -// If not specified, defaults to 1 -// -// REQUIRES: value >= 1 -func FixedUnigramCandidateSamplerNumShards(value int64) FixedUnigramCandidateSamplerAttr { - return func(m optionalAttr) { - m["num_shards"] = value - } -} - -// FixedUnigramCandidateSamplerShard sets the optional shard attribute to value. -// -// value: A sampler can be used to sample from a subset of the original range -// in order to speed up the whole computation through parallelism. This parameter -// (together with 'num_shards') indicates the particular partition number of a -// sampler op, when partitioning is being used. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func FixedUnigramCandidateSamplerShard(value int64) FixedUnigramCandidateSamplerAttr { - return func(m optionalAttr) { - m["shard"] = value - } -} - -// FixedUnigramCandidateSamplerUnigrams sets the optional unigrams attribute to value. 
-// -// value: A list of unigram counts or probabilities, one per ID in sequential -// order. Exactly one of vocab_file and unigrams should be passed to this op. -// If not specified, defaults to <> -func FixedUnigramCandidateSamplerUnigrams(value []float32) FixedUnigramCandidateSamplerAttr { - return func(m optionalAttr) { - m["unigrams"] = value - } -} - -// FixedUnigramCandidateSamplerSeed sets the optional seed attribute to value. -// -// value: If either seed or seed2 are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func FixedUnigramCandidateSamplerSeed(value int64) FixedUnigramCandidateSamplerAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// FixedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value. -// -// value: An second seed to avoid seed collision. -// If not specified, defaults to 0 -func FixedUnigramCandidateSamplerSeed2(value int64) FixedUnigramCandidateSamplerAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Generates labels for candidate sampling with a learned unigram distribution. -// -// A unigram sampler could use a fixed unigram distribution read from a -// file or passed in as an in-memory array instead of building up the distribution -// from data on the fly. There is also an option to skew the distribution by -// applying a distortion power to the weights. -// -// The vocabulary file should be in CSV-like format, with the last field -// being the weight associated with the word. -// -// For each batch, this op picks a single set of sampled candidate labels. -// -// The advantages of sampling candidates per-batch are simplicity and the -// possibility of efficient dense matrix multiplication. The disadvantage is that -// the sampled candidates must be chosen independently of the context and of the -// true labels. -// -// Arguments: -// true_classes: A batch_size * num_true matrix, in which each row contains the -// IDs of the num_true target_classes in the corresponding original label. -// num_true: Number of true labels per context. -// num_sampled: Number of candidates to randomly sample. -// unique: If unique is true, we sample with rejection, so that all sampled -// candidates in a batch are unique. This requires some approximation to -// estimate the post-rejection sampling probabilities. -// range_max: The sampler will sample integers from the interval [0, range_max). -// -// Returns A vector of length num_sampled, in which each element is -// the ID of a sampled candidate.A batch_size * num_true matrix, representing -// the number of times each candidate is expected to occur in a batch -// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled -// candidate representing the number of times the candidate is expected -// to occur in a batch of sampled candidates. If unique=true, then this is a -// probability. 
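//
// A minimal usage sketch (assuming the tensorflow/go `op` package; the class IDs and
// unigram counts below are illustrative only):
//
// ```go
// s := op.NewScope()
// trueClasses := op.Const(s, [][]int64{{0}, {3}}) // batch_size=2, num_true=1
// unigrams := op.FixedUnigramCandidateSamplerUnigrams([]float32{10, 5, 3, 1, 1})
// sampled, trueExpected, sampledExpected := op.FixedUnigramCandidateSampler(
// 	s, trueClasses, 1 /*num_true*/, 2 /*num_sampled*/, true /*unique*/, 5 /*range_max*/, unigrams)
// ```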
-func FixedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...FixedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "FixedUnigramCandidateSampler", - Input: []tf.Input{ - true_classes, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Returns 0 if the denominator is zero. -// -// -// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "DivNoNan", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// TensorSummaryAttr is an optional argument to TensorSummary. -type TensorSummaryAttr func(optionalAttr) - -// TensorSummaryDescription sets the optional description attribute to value. -// -// value: A json-encoded SummaryDescription proto. -// If not specified, defaults to "" -func TensorSummaryDescription(value string) TensorSummaryAttr { - return func(m optionalAttr) { - m["description"] = value - } -} - -// TensorSummaryLabels sets the optional labels attribute to value. -// -// value: An unused list of strings. -// If not specified, defaults to <> -func TensorSummaryLabels(value []string) TensorSummaryAttr { - return func(m optionalAttr) { - m["labels"] = value - } -} - -// TensorSummaryDisplayName sets the optional display_name attribute to value. -// -// value: An unused string. -// If not specified, defaults to "" -func TensorSummaryDisplayName(value string) TensorSummaryAttr { - return func(m optionalAttr) { - m["display_name"] = value - } -} - -// Outputs a `Summary` protocol buffer with a tensor. -// -// This op is being phased out in favor of TensorSummaryV2, which lets callers pass -// a tag as well as a serialized SummaryMetadata proto string that contains -// plugin-specific data. We will keep this op to maintain backwards compatibility. -// -// Arguments: -// tensor: A tensor to serialize. -func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TensorSummary", - Input: []tf.Input{ - tensor, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// QueueDequeueV2Attr is an optional argument to QueueDequeueV2. -type QueueDequeueV2Attr func(optionalAttr) - -// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value. -// -// value: If the queue is empty, this operation will block for up to -// timeout_ms milliseconds. -// Note: This option is not supported yet. -// If not specified, defaults to -1 -func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr { - return func(m optionalAttr) { - m["timeout_ms"] = value - } -} - -// Dequeues a tuple of one or more tensors from the given queue. 
-// -// This operation has k outputs, where k is the number of components -// in the tuples stored in the given queue, and output i is the ith -// component of the dequeued tuple. -// -// N.B. If the queue is empty, this operation will block until an element -// has been dequeued (or 'timeout_ms' elapses, if specified). -// -// Arguments: -// handle: The handle to a queue. -// component_types: The type of each component in a tuple. -// -// Returns One or more tensors that were dequeued as a tuple. -func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"component_types": component_types} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QueueDequeueV2", - Input: []tf.Input{ - handle, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if components, idx, err = makeOutputList(op, idx, "components"); err != nil { - scope.UpdateErr("QueueDequeueV2", err) - return - } - return components -} - -// Returns which elements of x are NaN. -// -// @compatibility(numpy) -// Equivalent to np.isnan -// @end_compatibility -func IsNan(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "IsNan", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns which elements of x are finite. -// -// @compatibility(numpy) -// Equivalent to np.isfinite -// @end_compatibility -func IsFinite(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "IsFinite", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns an element-wise indication of the sign of a number. -// -// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`. -// -// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`. -func Sign(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Sign", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns element-wise largest integer not greater than x. -func Floor(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Floor", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// A placeholder op for a value that will be fed into the computation. -// -// Arguments: -// dtype: The type of elements in the tensor. -// shape: The shape of the tensor. -// -// Returns A tensor that will be provided using the infeed mechanism. -func InfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype, "shape": shape} - opspec := tf.OpSpec{ - Type: "InfeedDequeue", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Checks a tensor for NaN and Inf values. -// -// When run, reports an `InvalidArgument` error if `tensor` has any values -// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is. -// -// Arguments: -// -// message: Prefix of the error message. 
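//
// A minimal usage sketch (assuming the tensorflow/go `op` package; the tensor and
// message prefix are illustrative only):
//
// ```go
// s := op.NewScope()
// t := op.Const(s, []float32{1, 2, 3})
// checked := op.CheckNumerics(s, t, "activations") // raises InvalidArgument at run time on NaN/Inf
// ```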
-func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"message": message} - opspec := tf.OpSpec{ - Type: "CheckNumerics", - Input: []tf.Input{ - tensor, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// PackAttr is an optional argument to Pack. -type PackAttr func(optionalAttr) - -// PackAxis sets the optional axis attribute to value. -// -// value: Dimension along which to pack. Negative values wrap around, so the -// valid range is `[-(R+1), R+1)`. -// If not specified, defaults to 0 -func PackAxis(value int64) PackAttr { - return func(m optionalAttr) { - m["axis"] = value - } -} - -// Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor. -// -// Packs the `N` tensors in `values` into a tensor with rank one higher than each -// tensor in `values`, by packing them along the `axis` dimension. -// Given a list of tensors of shape `(A, B, C)`; -// -// if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`. -// if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`. -// Etc. -// -// For example: -// -// ``` -// # 'x' is [1, 4] -// # 'y' is [2, 5] -// # 'z' is [3, 6] -// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]] # Pack along first dim. -// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]] -// ``` -// -// This is the opposite of `unpack`. -// -// Arguments: -// values: Must be of same shape and type. -// -// Returns The packed tensor. -func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Pack", - Input: []tf.Input{ - tf.OutputList(values), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns element-wise integer closest to x. -// -// If the result is midway between two representable values, -// the even representable is chosen. -// For example: -// -// ``` -// rint(-1.5) ==> -2.0 -// rint(0.5000001) ==> 1.0 -// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.] -// ``` -func Rint(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Rint", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Inverse fast Fourier transform. -// -// Computes the inverse 1-dimensional discrete Fourier transform over the -// inner-most dimension of `input`. -// -// Arguments: -// input: A complex tensor. -// -// Returns A complex tensor of the same shape as `input`. The inner-most -// dimension of `input` is replaced with its inverse 1D Fourier transform. -// -// @compatibility(numpy) -// Equivalent to np.fft.ifft -// @end_compatibility -func IFFT(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "IFFT", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors. -// -// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the -// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each -// input channel is processed independently of the others with its own structuring -// function. 
The `output` tensor has shape -// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output -// tensor depend on the `padding` algorithm. We currently only support the default -// "NHWC" `data_format`. -// -// In detail, the grayscale morphological 2-D dilation is the max-sum correlation -// (for consistency with `conv2d`, we use unmirrored filters): -// -// output[b, y, x, c] = -// max_{dy, dx} input[b, -// strides[1] * y + rates[1] * dy, -// strides[2] * x + rates[2] * dx, -// c] + -// filter[dy, dx, c] -// -// Max-pooling is a special case when the filter has size equal to the pooling -// kernel size and contains all zeros. -// -// Note on duality: The dilation of `input` by the `filter` is equal to the -// negation of the erosion of `-input` by the reflected `filter`. -// -// Arguments: -// input: 4-D with shape `[batch, in_height, in_width, depth]`. -// filter: 3-D with shape `[filter_height, filter_width, depth]`. -// strides: The stride of the sliding window for each dimension of the input -// tensor. Must be: `[1, stride_height, stride_width, 1]`. -// rates: The input stride for atrous morphological dilation. Must be: -// `[1, rate_height, rate_width, 1]`. -// padding: The type of padding algorithm to use. -// -// Returns 4-D with shape `[batch, out_height, out_width, depth]`. -func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding} - opspec := tf.OpSpec{ - Type: "Dilation2D", - Input: []tf.Input{ - input, filter, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad. -type ResizeBilinearGradAttr func(optionalAttr) - -// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value. -// -// value: If true, the centers of the 4 corner pixels of the input and grad tensors are -// aligned. Defaults to false. -// If not specified, defaults to false -func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr { - return func(m optionalAttr) { - m["align_corners"] = value - } -} - -// ResizeBilinearGradHalfPixelCenters sets the optional half_pixel_centers attribute to value. -// If not specified, defaults to false -func ResizeBilinearGradHalfPixelCenters(value bool) ResizeBilinearGradAttr { - return func(m optionalAttr) { - m["half_pixel_centers"] = value - } -} - -// Computes the gradient of bilinear interpolation. -// -// Arguments: -// grads: 4-D with shape `[batch, height, width, channels]`. -// original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`, -// The image tensor that was resized. -// -// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. -// Gradients with respect to the input image. Input image must have been -// float or double. -func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResizeBilinearGrad", - Input: []tf.Input{ - grads, original_image, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns x + y element-wise. -// -// *NOTE*: `Add` supports broadcasting. `AddN` does not. 
More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "AddV2", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns x - y element-wise. -// -// *NOTE*: `Subtract` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Sub", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// EditDistanceAttr is an optional argument to EditDistance. -type EditDistanceAttr func(optionalAttr) - -// EditDistanceNormalize sets the optional normalize attribute to value. -// -// value: boolean (if true, edit distances are normalized by length of truth). -// -// The output is: +// GatherValidateIndices sets the optional validate_indices attribute to value. // If not specified, defaults to true -func EditDistanceNormalize(value bool) EditDistanceAttr { +func GatherValidateIndices(value bool) GatherAttr { return func(m optionalAttr) { - m["normalize"] = value + m["validate_indices"] = value } } -// Computes the (possibly normalized) Levenshtein Edit Distance. +// Gather slices from `params` according to `indices`. // -// The inputs are variable-length sequences provided by SparseTensors -// (hypothesis_indices, hypothesis_values, hypothesis_shape) -// and -// (truth_indices, truth_values, truth_shape). -// -// The inputs are: -// -// Arguments: -// hypothesis_indices: The indices of the hypothesis list SparseTensor. -// This is an N x R int64 matrix. -// hypothesis_values: The values of the hypothesis list SparseTensor. -// This is an N-length vector. -// hypothesis_shape: The shape of the hypothesis list SparseTensor. -// This is an R-length vector. -// truth_indices: The indices of the truth list SparseTensor. -// This is an M x R int64 matrix. -// truth_values: The values of the truth list SparseTensor. -// This is an M-length vector. -// truth_shape: truth indices, vector. -// -// Returns A dense float tensor with rank R - 1. -// -// For the example input: -// -// // hypothesis represents a 2x1 matrix with variable-length values: -// // (0,0) = ["a"] -// // (1,0) = ["b"] -// hypothesis_indices = [[0, 0, 0], -// [1, 0, 0]] -// hypothesis_values = ["a", "b"] -// hypothesis_shape = [2, 1, 1] -// -// // truth represents a 2x2 matrix with variable-length values: -// // (0,0) = [] -// // (0,1) = ["a"] -// // (1,0) = ["b", "c"] -// // (1,1) = ["a"] -// truth_indices = [[0, 1, 0], -// [1, 0, 0], -// [1, 0, 1], -// [1, 1, 0]] -// truth_values = ["a", "b", "c", "a"] -// truth_shape = [2, 2, 2] -// normalize = true -// -// The output will be: -// -// // output is a 2x2 matrix with edit distances normalized by truth lengths. 
-// output = [[inf, 1.0], // (0,0): no truth, (0,1): no hypothesis -// [0.5, 1.0]] // (1,0): addition, (1,1): no hypothesis -func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "EditDistance", - Input: []tf.Input{ - hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the Bessel i1e function of `x` element-wise. -// -// Exponentially scaled modified Bessel function of order 0 defined as -// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`. -// -// This function is faster and numerically stabler than `bessel_i1(x)`. -func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "BesselI1e", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns x * y element-wise. Returns zero if y is zero, even if x if infinite or NaN. -// -// *NOTE*: `Mul` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func MulNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "MulNoNan", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// A container for an iterator resource. -// -// Returns A handle to the iterator that can be passed to a "MakeIterator" -// or "IteratorGetNext" op. -func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "Iterator", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// AvgPoolAttr is an optional argument to AvgPool. -type AvgPoolAttr func(optionalAttr) - -// AvgPoolDataFormat sets the optional data_format attribute to value. -// -// value: Specify the data format of the input and output data. With the -// default format "NHWC", the data is stored in the order of: -// [batch, in_height, in_width, in_channels]. -// Alternatively, the format could be "NCHW", the data storage order of: -// [batch, in_channels, in_height, in_width]. -// If not specified, defaults to "NHWC" -func AvgPoolDataFormat(value string) AvgPoolAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Performs average pooling on the input. -// -// Each entry in `output` is the mean of the corresponding size `ksize` -// window in `value`. -// -// Arguments: -// value: 4-D with shape `[batch, height, width, channels]`. -// ksize: The size of the sliding window for each dimension of `value`. -// strides: The stride of the sliding window for each dimension of `value`. -// padding: The type of padding algorithm to use. -// -// Returns The average pooled output tensor. 
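// A minimal usage sketch for the AvgPool wrapper below, showing the
// functional-option style used by these generated wrappers (illustrative only;
// assumes the standard tf and op packages; the names s, images and pooled are
// not part of this diff):
//
// ```
// s := op.NewScope()
// images := op.Placeholder(s, tf.Float) // NHWC input
// // 2x2 mean pooling with stride 2 and VALID padding.
// pooled := op.AvgPool(s, images,
//     []int64{1, 2, 2, 1}, // ksize
//     []int64{1, 2, 2, 1}, // strides
//     "VALID",
//     op.AvgPoolDataFormat("NHWC"))
// _ = pooled
// ```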
-func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "AvgPool", - Input: []tf.Input{ - value, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns x / y element-wise. -// -// *NOTE*: `Div` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Div", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns x / y element-wise for real types. -// -// If `x` and `y` are reals, this will return the floating-point division. -// -// *NOTE*: `Div` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "RealDiv", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// SumAttr is an optional argument to Sum. -type SumAttr func(optionalAttr) - -// SumKeepDims sets the optional keep_dims attribute to value. -// -// value: If true, retain reduced dimensions with length 1. -// If not specified, defaults to false -func SumKeepDims(value bool) SumAttr { - return func(m optionalAttr) { - m["keep_dims"] = value - } -} - -// Computes the sum of elements across dimensions of a tensor. -// -// Reduces `input` along the dimensions given in `axis`. Unless -// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -// `axis`. If `keep_dims` is true, the reduced dimensions are -// retained with length 1. -// -// Arguments: -// input: The tensor to reduce. -// axis: The dimensions to reduce. Must be in the range -// `[-rank(input), rank(input))`. -// -// Returns The reduced tensor. -func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Sum", - Input: []tf.Input{ - input, axis, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Scatter `updates` into a new tensor according to `indices`. -// -// Creates a new tensor by applying sparse `updates` to individual values or -// slices within a tensor (initially zero for numeric, empty for string) of -// the given `shape` according to indices. This operator is the inverse of the -// `tf.gather_nd` operator which extracts values or slices from a given tensor. -// -// This operation is similar to tensor_scatter_add, except that the tensor is -// zero-initialized. Calling `tf.scatter_nd(indices, values, shape)` is identical -// to `tensor_scatter_add(tf.zeros(shape, values.dtype), indices, values)` -// -// If `indices` contains duplicates, then their updates are accumulated (summed). 
-// -// **WARNING**: The order in which updates are applied is nondeterministic, so the -// output will be nondeterministic if `indices` contains duplicates -- because -// of some numerical approximation issues, numbers summed in different order -// may yield different results. -// -// `indices` is an integer tensor containing indices into a new tensor of shape -// `shape`. The last dimension of `indices` can be at most the rank of `shape`: -// -// indices.shape[-1] <= shape.rank -// -// The last dimension of `indices` corresponds to indices into elements -// (if `indices.shape[-1] = shape.rank`) or slices -// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of -// `shape`. `updates` is a tensor with shape -// -// indices.shape[:-1] + shape[indices.shape[-1]:] -// -// The simplest form of scatter is to insert individual elements in a tensor by -// index. For example, say we want to insert 4 scattered elements in a rank-1 -// tensor with 8 elements. -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt> -// </div> -// -// In Python, this scatter operation would look like this: +// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D). +// Produces an output tensor with shape `indices.shape + params.shape[1:]` where: // // ```python -// indices = tf.constant([[4], [3], [1], [7]]) -// updates = tf.constant([9, 10, 11, 12]) -// shape = tf.constant([8]) -// scatter = tf.scatter_nd(indices, updates, shape) -// with tf.Session() as sess: -// print(sess.run(scatter)) +// # Scalar indices +// output[:, ..., :] = params[indices, :, ... :] +// +// # Vector indices +// output[i, :, ..., :] = params[indices[i], :, ... :] +// +// # Higher rank indices +// output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :] // ``` // -// The resulting tensor would look like this: +// If `indices` is a permutation and `len(indices) == params.shape[0]` then +// this operation will permute `params` accordingly. // -// [0, 11, 0, 10, 9, 0, 0, 12] -// -// We can also, insert entire slices of a higher rank tensor all at once. For -// example, if we wanted to insert two slices in the first dimension of a -// rank-3 tensor with two matrices of new values. +// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in +// `indices` are always validated to be within range. If assigned to GPU, +// out-of-bound indices result in safe but unspecified behavior, which may include +// raising an error. 
// // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt> +// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt> // </div> -// -// In Python, this scatter operation would look like this: -// -// ```python -// indices = tf.constant([[0], [2]]) -// updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6], -// [7, 7, 7, 7], [8, 8, 8, 8]], -// [[5, 5, 5, 5], [6, 6, 6, 6], -// [7, 7, 7, 7], [8, 8, 8, 8]]]) -// shape = tf.constant([4, 4, 4]) -// scatter = tf.scatter_nd(indices, updates, shape) -// with tf.Session() as sess: -// print(sess.run(scatter)) -// ``` -// -// The resulting tensor would look like this: -// -// [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], -// [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], -// [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], -// [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]] -// -// Note that on CPU, if an out of bound index is found, an error is returned. -// On GPU, if an out of bound index is found, the index is ignored. -// -// Arguments: -// indices: Index tensor. -// updates: Updates to scatter into output. -// shape: 1-D. The shape of the resulting tensor. -// -// Returns A new tensor with the given shape and updates applied according -// to the indices. -func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ScatterNd", - Input: []tf.Input{ - indices, updates, shape, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns (x - y)(x - y) element-wise. -// -// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SquaredDifference", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns 0 if x == 0, and x / y otherwise, elementwise. -func Xdivy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Xdivy", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign. -type ResourceApplyPowerSignAttr func(optionalAttr) - -// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var and m tensors is -// protected by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update '*var' according to the AddSign update. -// -// m_t <- beta1 * m_{t-1} + (1 - beta1) * g -// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g -// variable <- variable - lr_t * update -// -// Arguments: -// var_: Should be from a Variable(). -// m: Should be from a Variable(). -// lr: Scaling factor. Must be a scalar. -// logbase: Must be a scalar. -// sign_decay: Must be a scalar. -// beta: Must be a scalar. -// grad: The gradient. -// -// Returns the created operation. 
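// A minimal usage sketch for the Gather wrapper introduced below (illustrative
// only; assumes the standard tf and op packages; the names s, params, indices
// and rows are not part of this diff):
//
// ```
// s := op.NewScope()
// params := op.Const(s, [][]float32{{1, 2}, {3, 4}, {5, 6}})
// indices := op.Const(s, []int32{2, 0})
// // rows has shape indices.shape + params.shape[1:] = [2, 2]:
// // [[5, 6], [1, 2]]
// rows := op.Gather(s, params, indices, op.GatherValidateIndices(true))
// _ = rows
// ```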
-func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) { +func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -31334,1533 +38703,9 @@ func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Out a(attrs) } opspec := tf.OpSpec{ - Type: "ResourceApplyPowerSign", + Type: "Gather", Input: []tf.Input{ - var_, m, lr, logbase, sign_decay, beta, grad, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Returns the min of x and y (i.e. x < y ? x : y) element-wise. -// -// *NOTE*: `Minimum` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Minimum", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns element-wise remainder of division. This emulates C semantics in that -// -// the result here is consistent with a truncating divide. E.g. `truncate(x / y) * -// y + truncate_mod(x, y) = x`. -// -// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TruncateMod", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Compute the lower regularized incomplete Gamma function `P(a, x)`. -// -// The lower regularized incomplete Gamma function is defined as: -// -// -// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\) -// -// where -// -// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\) -// -// is the lower incomplete Gamma function. -// -// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete -// Gamma function. -func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Igamma", - Input: []tf.Input{ - a, x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Outputs a tensor containing the reduction across all input tensors. -// -// Outputs a tensor containing the reduction across all input tensors passed to ops -// within the same `shared_name. -// -// The graph should be constructed so if one op runs with shared_name value `c`, -// then `num_devices` ops will run with shared_name value `c`. Failure to do so -// will cause the graph execution to fail to complete. -// -// input: the input to the reduction -// data: the value of the reduction across all `num_devices` devices. -// reduction: the reduction operation to perform. -// num_devices: The number of devices participating in this reduction. -// shared_name: Identifier that shared between ops of the same reduction. 
-func NcclAllReduce(scope *Scope, input tf.Output, reduction string, num_devices int64, shared_name string) (data tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"reduction": reduction, "num_devices": num_devices, "shared_name": shared_name} - opspec := tf.OpSpec{ - Type: "NcclAllReduce", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// QuantizedMulAttr is an optional argument to QuantizedMul. -type QuantizedMulAttr func(optionalAttr) - -// QuantizedMulToutput sets the optional Toutput attribute to value. -// If not specified, defaults to DT_QINT32 -func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr { - return func(m optionalAttr) { - m["Toutput"] = value - } -} - -// Returns x * y element-wise, working on quantized buffers. -// -// Arguments: -// -// -// min_x: The float value that the lowest quantized `x` value represents. -// max_x: The float value that the highest quantized `x` value represents. -// min_y: The float value that the lowest quantized `y` value represents. -// max_y: The float value that the highest quantized `y` value represents. -// -// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. -// -// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about -// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QuantizedMul", - Input: []tf.Input{ - x, y, min_x, max_x, min_y, max_y, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// MinAttr is an optional argument to Min. -type MinAttr func(optionalAttr) - -// MinKeepDims sets the optional keep_dims attribute to value. -// -// value: If true, retain reduced dimensions with length 1. -// If not specified, defaults to false -func MinKeepDims(value bool) MinAttr { - return func(m optionalAttr) { - m["keep_dims"] = value - } -} - -// Computes the minimum of elements across dimensions of a tensor. -// -// Reduces `input` along the dimensions given in `axis`. Unless -// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -// `axis`. If `keep_dims` is true, the reduced dimensions are -// retained with length 1. -// -// Arguments: -// input: The tensor to reduce. -// axis: The dimensions to reduce. Must be in the range -// `[-rank(input), rank(input))`. -// -// Returns The reduced tensor. -func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Min", - Input: []tf.Input{ - input, axis, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Compute the Hurwitz zeta function \\(\zeta(x, q)\\). 
-// -// The Hurwitz zeta function is defined as: -// -// -// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\) -func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Zeta", - Input: []tf.Input{ - x, q, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes arctangent of `y/x` element-wise, respecting signs of the arguments. -// -// This is the angle \( \theta \in [-\pi, \pi] \) such that -// \[ x = r \cos(\theta) \] -// and -// \[ y = r \sin(\theta) \] -// where \(r = \sqrt(x^2 + y^2) \). -func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Atan2", - Input: []tf.Input{ - y, x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// PrelinearizeAttr is an optional argument to Prelinearize. -type PrelinearizeAttr func(optionalAttr) - -// PrelinearizeShape sets the optional shape attribute to value. -// -// value: The shape of the tensor. -// If not specified, defaults to <> -func PrelinearizeShape(value tf.Shape) PrelinearizeAttr { - return func(m optionalAttr) { - m["shape"] = value - } -} - -// PrelinearizeLayout sets the optional layout attribute to value. -// -// value: A vector holding the requested layout in minor-to-major sequence. If a layout -// attribute is passed but its values are all -1 the layout will be computed by -// the infeed operation. -// If not specified, defaults to <> -func PrelinearizeLayout(value []int64) PrelinearizeAttr { - return func(m optionalAttr) { - m["layout"] = value - } -} - -// An op which linearizes one Tensor value to an opaque variant tensor. -// -// Arguments: -// input: A tensor that will be linearized. -func Prelinearize(scope *Scope, input tf.Output, optional ...PrelinearizeAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Prelinearize", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Compute the regularized incomplete beta integral \\(I_x(a, b)\\). -// -// The regularized incomplete beta integral is defined as: -// -// -// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\) -// -// where -// -// -// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\) -// -// -// is the incomplete beta function and \\(B(a, b)\\) is the *complete* -// beta function. -func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Betainc", - Input: []tf.Input{ - a, b, x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize. -type OrderedMapIncompleteSizeAttr func(optionalAttr) - -// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value. 
-// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr { - return func(m optionalAttr) { - m["memory_limit"] = value - } -} - -// OrderedMapIncompleteSizeContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Op returns the number of incomplete elements in the underlying container. -func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtypes": dtypes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "OrderedMapIncompleteSize", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the truth value of (x < y) element-wise. -// -// *NOTE*: `Less` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Less", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute. -type DataFormatVecPermuteAttr func(optionalAttr) - -// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value. -// -// value: source data format. -// If not specified, defaults to "NHWC" -func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr { - return func(m optionalAttr) { - m["src_format"] = value - } -} - -// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value. -// -// value: destination data format. -// If not specified, defaults to "NCHW" -func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr { - return func(m optionalAttr) { - m["dst_format"] = value - } -} - -// Returns the permuted vector/tensor in the destination data format given the -// -// one in the source data format. -// -// Arguments: -// x: Vector of size 4 or Tensor of shape (4, 2) in source data format. -// -// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format. -func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "DataFormatVecPermute", - Input: []tf.Input{ - x, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MeanAttr is an optional argument to Mean. -type MeanAttr func(optionalAttr) - -// MeanKeepDims sets the optional keep_dims attribute to value. -// -// value: If true, retain reduced dimensions with length 1. 
-// If not specified, defaults to false -func MeanKeepDims(value bool) MeanAttr { - return func(m optionalAttr) { - m["keep_dims"] = value - } -} - -// Computes the mean of elements across dimensions of a tensor. -// -// Reduces `input` along the dimensions given in `axis`. Unless -// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -// `axis`. If `keep_dims` is true, the reduced dimensions are -// retained with length 1. -// -// Arguments: -// input: The tensor to reduce. -// axis: The dimensions to reduce. Must be in the range -// `[-rank(input), rank(input))`. -// -// Returns The reduced tensor. -func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Mean", - Input: []tf.Input{ - input, axis, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Updates the table to associates keys with values. -// -// The tensor `keys` must be of the same type as the keys of the table. -// The tensor `values` must be of the type of the table values. -// -// Arguments: -// table_handle: Handle to the table. -// keys: Any shape. Keys to look up. -// values: Values to associate with keys. -// -// Returns the created operation. -func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "LookupTableInsertV2", - Input: []tf.Input{ - table_handle, keys, values, - }, - } - return scope.AddOperation(opspec) -} - -// Returns the truth value of (x >= y) element-wise. -// -// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "GreaterEqual", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// TPUReplicateMetadataAttr is an optional argument to TPUReplicateMetadata. -type TPUReplicateMetadataAttr func(optionalAttr) - -// TPUReplicateMetadataNumCoresPerReplica sets the optional num_cores_per_replica attribute to value. -// -// value: Number of cores per replica. Used for model parallelism. -// If not specified, defaults to 1 -func TPUReplicateMetadataNumCoresPerReplica(value int64) TPUReplicateMetadataAttr { - return func(m optionalAttr) { - m["num_cores_per_replica"] = value - } -} - -// TPUReplicateMetadataTopology sets the optional topology attribute to value. -// -// value: TopologyProto indicating the topology of the TPU pod slice. -// If not specified, defaults to "" -func TPUReplicateMetadataTopology(value string) TPUReplicateMetadataAttr { - return func(m optionalAttr) { - m["topology"] = value - } -} - -// TPUReplicateMetadataUseTpu sets the optional use_tpu attribute to value. -// -// value: Whether to place the computation on the TPU. -// If not specified, defaults to true -func TPUReplicateMetadataUseTpu(value bool) TPUReplicateMetadataAttr { - return func(m optionalAttr) { - m["use_tpu"] = value - } -} - -// TPUReplicateMetadataDeviceAssignment sets the optional device_assignment attribute to value. -// -// value: The assignment of devices for the computation. 
-// If not specified, defaults to <> -func TPUReplicateMetadataDeviceAssignment(value []int64) TPUReplicateMetadataAttr { - return func(m optionalAttr) { - m["device_assignment"] = value - } -} - -// TPUReplicateMetadataComputationShape sets the optional computation_shape attribute to value. -// -// value: DEPRECATED. Use num_cores_per_replica instead. -// If not specified, defaults to <> -func TPUReplicateMetadataComputationShape(value []int64) TPUReplicateMetadataAttr { - return func(m optionalAttr) { - m["computation_shape"] = value - } -} - -// TPUReplicateMetadataHostComputeCore sets the optional host_compute_core attribute to value. -// If not specified, defaults to <> -func TPUReplicateMetadataHostComputeCore(value []string) TPUReplicateMetadataAttr { - return func(m optionalAttr) { - m["host_compute_core"] = value - } -} - -// TPUReplicateMetadataPaddingMap sets the optional padding_map attribute to value. -// If not specified, defaults to <> -func TPUReplicateMetadataPaddingMap(value []string) TPUReplicateMetadataAttr { - return func(m optionalAttr) { - m["padding_map"] = value - } -} - -// TPUReplicateMetadataStepMarkerLocation sets the optional step_marker_location attribute to value. -// If not specified, defaults to "STEP_MARK_AT_ENTRY" -func TPUReplicateMetadataStepMarkerLocation(value string) TPUReplicateMetadataAttr { - return func(m optionalAttr) { - m["step_marker_location"] = value - } -} - -// Metadata indicaitng how the TPU computation should be replicated. -// -// Arguments: -// num_replicas: Number of replicas of the computation -// -// Returns the created operation. -func TPUReplicateMetadata(scope *Scope, num_replicas int64, optional ...TPUReplicateMetadataAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_replicas": num_replicas} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TPUReplicateMetadata", - - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Returns the truth value of (x == y) element-wise. -// -// *NOTE*: `Equal` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Equal", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ApproximateEqualAttr is an optional argument to ApproximateEqual. -type ApproximateEqualAttr func(optionalAttr) - -// ApproximateEqualTolerance sets the optional tolerance attribute to value. -// If not specified, defaults to 1e-05 -func ApproximateEqualTolerance(value float32) ApproximateEqualAttr { - return func(m optionalAttr) { - m["tolerance"] = value - } -} - -// Returns the truth value of abs(x-y) < tolerance element-wise. -func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ApproximateEqual", - Input: []tf.Input{ - x, y, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// BatchDatasetV2Attr is an optional argument to BatchDatasetV2. -type BatchDatasetV2Attr func(optionalAttr) - -// BatchDatasetV2ParallelCopy sets the optional parallel_copy attribute to value. 
-// If not specified, defaults to false -func BatchDatasetV2ParallelCopy(value bool) BatchDatasetV2Attr { - return func(m optionalAttr) { - m["parallel_copy"] = value - } -} - -// Creates a dataset that batches `batch_size` elements from `input_dataset`. -// -// Arguments: -// -// batch_size: A scalar representing the number of elements to accumulate in a batch. -// drop_remainder: A scalar representing whether the last batch should be dropped in case its size -// is smaller than desired. -// -// -func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...BatchDatasetV2Attr) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "BatchDatasetV2", - Input: []tf.Input{ - input_dataset, batch_size, drop_remainder, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MatMulAttr is an optional argument to MatMul. -type MatMulAttr func(optionalAttr) - -// MatMulTransposeA sets the optional transpose_a attribute to value. -// -// value: If true, "a" is transposed before multiplication. -// If not specified, defaults to false -func MatMulTransposeA(value bool) MatMulAttr { - return func(m optionalAttr) { - m["transpose_a"] = value - } -} - -// MatMulTransposeB sets the optional transpose_b attribute to value. -// -// value: If true, "b" is transposed before multiplication. -// If not specified, defaults to false -func MatMulTransposeB(value bool) MatMulAttr { - return func(m optionalAttr) { - m["transpose_b"] = value - } -} - -// Multiply the matrix "a" by the matrix "b". -// -// The inputs must be two-dimensional matrices and the inner dimension of -// "a" (after being transposed if transpose_a is true) must match the -// outer dimension of "b" (after being transposed if transposed_b is -// true). -// -// *Note*: The default kernel implementation for MatMul on GPUs uses -// cublas. -func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MatMul", - Input: []tf.Input{ - a, b, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// LoadTPUEmbeddingStochasticGradientDescentParametersAttr is an optional argument to LoadTPUEmbeddingStochasticGradientDescentParameters. -type LoadTPUEmbeddingStochasticGradientDescentParametersAttr func(optionalAttr) - -// LoadTPUEmbeddingStochasticGradientDescentParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func LoadTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) LoadTPUEmbeddingStochasticGradientDescentParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// LoadTPUEmbeddingStochasticGradientDescentParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func LoadTPUEmbeddingStochasticGradientDescentParametersTableName(value string) LoadTPUEmbeddingStochasticGradientDescentParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Load SGD embedding parameters. 
-// -// An op that loads optimization parameters into HBM for embedding. Must be -// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct -// embedding table configuration. For example, this op is used to install -// parameters that are loaded from a checkpoint before a training loop is -// executed. -// -// Arguments: -// parameters: Value of parameters used in the stochastic gradient descent optimization algorithm. -// -// -// -// Returns the created operation. -func LoadTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, parameters tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingStochasticGradientDescentParametersAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "LoadTPUEmbeddingStochasticGradientDescentParameters", - Input: []tf.Input{ - parameters, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// ArgMaxAttr is an optional argument to ArgMax. -type ArgMaxAttr func(optionalAttr) - -// ArgMaxOutputType sets the optional output_type attribute to value. -// If not specified, defaults to DT_INT64 -func ArgMaxOutputType(value tf.DataType) ArgMaxAttr { - return func(m optionalAttr) { - m["output_type"] = value - } -} - -// Returns the index with the largest value across dimensions of a tensor. -// -// Note that in case of ties the identity of the return value is not guaranteed. -// -// Usage: -// ```python -// import tensorflow as tf -// a = [1, 10, 26.9, 2.8, 166.32, 62.3] -// b = tf.math.argmax(input = a) -// c = tf.keras.backend.eval(b) -// # c = 4 -// # here a[4] = 166.32 which is the largest element of a across axis 0 -// ``` -// -// Arguments: -// -// dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`. -// Describes which dimension of the input Tensor to reduce across. For vectors, -// use dimension = 0. -func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ArgMax", - Input: []tf.Input{ - input, dimension, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ArgMinAttr is an optional argument to ArgMin. -type ArgMinAttr func(optionalAttr) - -// ArgMinOutputType sets the optional output_type attribute to value. -// If not specified, defaults to DT_INT64 -func ArgMinOutputType(value tf.DataType) ArgMinAttr { - return func(m optionalAttr) { - m["output_type"] = value - } -} - -// Returns the index with the smallest value across dimensions of a tensor. -// -// Note that in case of ties the identity of the return value is not guaranteed. -// -// Usage: -// ```python -// import tensorflow as tf -// a = [1, 10, 26.9, 2.8, 166.32, 62.3] -// b = tf.math.argmin(input = a) -// c = tf.keras.backend.eval(b) -// # c = 0 -// # here a[0] = 1 which is the smallest element of a across axis 0 -// ``` -// -// Arguments: -// -// dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`. -// Describes which dimension of the input Tensor to reduce across. For vectors, -// use dimension = 0. 
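// A Go counterpart to the Python snippets above for the ArgMax and ArgMin
// wrappers (illustrative only; assumes the standard tf and op packages; the
// names s, a, axis, largest and smallest are not part of this diff):
//
// ```
// s := op.NewScope()
// a := op.Const(s, []float32{1, 10, 26.9, 2.8, 166.32, 62.3})
// axis := op.Const(s, int32(0))
// largest := op.ArgMax(s, a, axis)                                 // ==> 4
// smallest := op.ArgMin(s, a, axis, op.ArgMinOutputType(tf.Int32)) // ==> 0
// _, _ = largest, smallest
// ```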
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ArgMin", - Input: []tf.Input{ - input, dimension, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the mean along segments of a tensor. -// -// Read -// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) -// for an explanation of segments. -// -// Computes a tensor such that -// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is -// over `j` such that `segment_ids[j] == i` and `N` is the total number of -// values summed. -// -// If the mean is empty for a given segment ID `i`, `output[i] = 0`. -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt> -// </div> -// -// For example: -// -// ``` -// c = tf.constant([[1.0,2,3,4], [4, 3, 2, 1], [5,6,7,8]]) -// tf.segment_mean(c, tf.constant([0, 0, 1])) -// # ==> [[2.5, 2.5, 2.5, 2.5], -// # [5, 6, 7, 8]] -// ``` -// -// -// Arguments: -// -// segment_ids: A 1-D tensor whose size is equal to the size of `data`'s -// first dimension. Values should be sorted and can be repeated. -// -// Returns Has same shape as data, except for dimension 0 which -// has size `k`, the number of segments. -func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SegmentMean", - Input: []tf.Input{ - data, segment_ids, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Decodes a `variant` Tensor into a `RaggedTensor`. -// -// Decodes the given `variant` Tensor and returns a `RaggedTensor`. The input -// could be a scalar, meaning it encodes a single `RaggedTensor` with ragged_rank -// `output_ragged_rank`. It could also have an arbitrary rank, in which case each -// element is decoded into a `RaggedTensor` with ragged_rank `input_ragged_rank` -// and these are then stacked according to the input shape to output a single -// `RaggedTensor` with ragged_rank `output_ragged_rank`. Each `variant` element in -// the input Tensor is decoded by retrieving from the element a 1-D `variant` -// Tensor with `input_ragged_rank + 1` Tensors, corresponding to the splits and -// values of the decoded `RaggedTensor`. If `input_ragged_rank` is -1, then it is -// inferred as `output_ragged_rank` - `rank(encoded_ragged)`. See -// `RaggedTensorToVariant` for the corresponding encoding logic. -// -// -// Arguments: -// encoded_ragged: A `variant` Tensor containing encoded `RaggedTensor`s. -// input_ragged_rank: The ragged rank of each encoded `RaggedTensor` component in the input. If set to -// -1, this is inferred as `output_ragged_rank` - `rank(encoded_ragged)` -// output_ragged_rank: The expected ragged rank of the output `RaggedTensor`. The following must hold: -// `output_ragged_rank = rank(encoded_ragged) + input_ragged_rank`. -// -// -// -// Returns A list of one or more Tensors representing the splits of the output -// `RaggedTensor`.A Tensor representing the values of the output `RaggedTensor`. 
-func RaggedTensorFromVariant(scope *Scope, encoded_ragged tf.Output, input_ragged_rank int64, output_ragged_rank int64, Tvalues tf.DataType, Tsplits tf.DataType) (output_nested_splits []tf.Output, output_dense_values tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"input_ragged_rank": input_ragged_rank, "output_ragged_rank": output_ragged_rank, "Tvalues": Tvalues, "Tsplits": Tsplits} - opspec := tf.OpSpec{ - Type: "RaggedTensorFromVariant", - Input: []tf.Input{ - encoded_ragged, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil { - scope.UpdateErr("RaggedTensorFromVariant", err) - return - } - output_dense_values = op.Output(idx) - return output_nested_splits, output_dense_values -} - -// Computes the product along segments of a tensor. -// -// Read -// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) -// for an explanation of segments. -// -// Computes a tensor such that -// \\(output_i = \prod_j data_j\\) where the product is over `j` such -// that `segment_ids[j] == i`. -// -// If the product is empty for a given segment ID `i`, `output[i] = 1`. -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt> -// </div> -// -// For example: -// -// ``` -// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]]) -// tf.segment_prod(c, tf.constant([0, 0, 1])) -// # ==> [[4, 6, 6, 4], -// # [5, 6, 7, 8]] -// ``` -// -// -// Arguments: -// -// segment_ids: A 1-D tensor whose size is equal to the size of `data`'s -// first dimension. Values should be sorted and can be repeated. -// -// Returns Has same shape as data, except for dimension 0 which -// has size `k`, the number of segments. -func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SegmentProd", - Input: []tf.Input{ - data, segment_ids, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the product along segments of a tensor. -// -// Read -// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) -// for an explanation of segments. -// -// This operator is similar to the unsorted segment sum operator found -// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum). -// Instead of computing the sum over segments, it computes the product of all -// entries belonging to a segment such that: -// -// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples -// `j...` such that `segment_ids[j...] == i`. -// -// For example: -// -// ``` python -// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]]) -// tf.unsorted_segment_prod(c, tf.constant([0, 1, 0]), num_segments=2) -// # ==> [[ 4, 6, 6, 4], -// # [5, 6, 7, 8]] -// ``` -// -// If there is no entry for a given segment ID `i`, it outputs 1. -// -// If the given segment ID `i` is negative, then the corresponding value is -// dropped, and will not be included in the result. -// -// Arguments: -// -// segment_ids: A tensor whose shape is a prefix of `data.shape`. 
-// -// -// Returns Has same shape as data, except for the first `segment_ids.rank` -// dimensions, which are replaced with a single dimension which has size -// `num_segments`. -func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "UnsortedSegmentProd", - Input: []tf.Input{ - data, segment_ids, num_segments, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes softmax cross entropy cost and gradients to backpropagate. -// -// Inputs are the logits, not probabilities. -// -// Arguments: -// features: batch_size x num_classes matrix -// labels: batch_size x num_classes matrix -// The caller must ensure that each batch of labels represents a valid -// probability distribution. -// -// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix). -func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SoftmaxCrossEntropyWithLogits", - Input: []tf.Input{ - features, labels, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// Computes the sum along sparse segments of a tensor. -// -// Read -// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) -// for an explanation of segments. -// -// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first -// dimension, selecting a subset of dimension 0, specified by `indices`. -// -// For example: -// -// ```python -// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]]) -// -// # Select two rows, one segment. -// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0])) -// # => [[0 0 0 0]] -// -// # Select two rows, two segment. -// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1])) -// # => [[ 1 2 3 4] -// # [-1 -2 -3 -4]] -// -// # Select all rows, two segments. -// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1])) -// # => [[0 0 0 0] -// # [5 6 7 8]] -// -// # Which is equivalent to: -// tf.segment_sum(c, tf.constant([0, 0, 1])) -// ``` -// -// Arguments: -// -// indices: A 1-D tensor. Has same rank as `segment_ids`. -// segment_ids: A 1-D tensor. Values should be sorted and can be repeated. -// -// Returns Has same shape as data, except for dimension 0 which -// has size `k`, the number of segments. -func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseSegmentSum", - Input: []tf.Input{ - data, indices, segment_ids, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RandomCropAttr is an optional argument to RandomCrop. -type RandomCropAttr func(optionalAttr) - -// RandomCropSeed sets the optional seed attribute to value. -// -// value: If either seed or seed2 are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func RandomCropSeed(value int64) RandomCropAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// RandomCropSeed2 sets the optional seed2 attribute to value. -// -// value: An second seed to avoid seed collision. 
-// If not specified, defaults to 0 -func RandomCropSeed2(value int64) RandomCropAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Randomly crop `image`. -// -// DEPRECATED at GraphDef version 8: Random crop is now pure Python -// -// `size` is a 1-D int64 tensor with 2 elements representing the crop height and -// width. The values must be non negative. -// -// This Op picks a random location in `image` and crops a `height` by `width` -// rectangle from that location. The random location is picked so the cropped -// area will fit inside the original image. -// -// Arguments: -// image: 3-D of shape `[height, width, channels]`. -// size: 1-D of length 2 containing: `crop_height`, `crop_width`.. -// -// Returns 3-D of shape `[crop_height, crop_width, channels].` -func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RandomCrop", - Input: []tf.Input{ - image, size, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes gradients for SparseSegmentMean. -// -// Returns tensor "output" with same shape as grad, except for dimension 0 whose -// value is output_dim0. -// -// Arguments: -// grad: gradient propagated to the SparseSegmentMean op. -// indices: indices passed to the corresponding SparseSegmentMean op. -// segment_ids: segment_ids passed to the corresponding SparseSegmentMean op. -// output_dim0: dimension 0 of "data" passed to SparseSegmentMean op. -func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseSegmentMeanGrad", - Input: []tf.Input{ - grad, indices, segment_ids, output_dim0, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Serializes the tree handle to a proto -// -// Arguments: -// tree_handle: Handle to the tree resource to be serialized. -// -// Returns Serialied proto string of the tree resource. -func TensorForestTreeSerialize(scope *Scope, tree_handle tf.Output) (tree_config tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorForestTreeSerialize", - Input: []tf.Input{ - tree_handle, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Calculate product with tridiagonal matrix. -// -// Calculates product of two matrices, where left matrix is a tridiagonal matrix. -// -// Arguments: -// superdiag: Tensor of shape `[..., 1, M]`, representing superdiagonals of -// tri-diagonal matrices to the left of multiplication. Last element is ingored. -// maindiag: Tensor of shape `[..., 1, M]`, representing main diagonals of tri-diagonal -// matrices to the left of multiplication. -// subdiag: Tensor of shape `[..., 1, M]`, representing subdiagonals of tri-diagonal -// matrices to the left of multiplication. First element is ingored. -// rhs: Tensor of shape `[..., M, N]`, representing MxN matrices to the right of -// multiplication. -// -// Returns Tensor of shape `[..., M, N]` containing the product. 
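// A minimal usage sketch for the TridiagonalMatMul wrapper below, spelling out
// the `[..., 1, M]` diagonal and `[..., M, N]` right-hand-side shapes described
// above (illustrative only; assumes the standard tf and op packages; the names
// s, super, maind, sub, rhs and product are not part of this diff):
//
// ```
// s := op.NewScope()
// // Diagonals of a single 3x3 tridiagonal matrix, each of shape [1, 3].
// super := op.Const(s, [][]float32{{2, 2, 0}}) // last element ignored
// maind := op.Const(s, [][]float32{{1, 1, 1}})
// sub := op.Const(s, [][]float32{{0, 3, 3}})   // first element ignored
// rhs := op.Const(s, [][]float32{{1, 0}, {0, 1}, {1, 1}}) // shape [3, 2]
// product := op.TridiagonalMatMul(s, super, maind, sub, rhs) // shape [3, 2]
// _ = product
// ```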
-func TridiagonalMatMul(scope *Scope, superdiag tf.Output, maindiag tf.Output, subdiag tf.Output, rhs tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TridiagonalMatMul", - Input: []tf.Input{ - superdiag, maindiag, subdiag, rhs, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RetrieveTPUEmbeddingFTRLParametersAttr is an optional argument to RetrieveTPUEmbeddingFTRLParameters. -type RetrieveTPUEmbeddingFTRLParametersAttr func(optionalAttr) - -// RetrieveTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingFTRLParametersTableId(value int64) RetrieveTPUEmbeddingFTRLParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingFTRLParametersTableName(value string) RetrieveTPUEmbeddingFTRLParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve FTRL embedding parameters. -// -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. -// -// Returns Parameter parameters updated by the FTRL optimization algorithm.Parameter accumulators updated by the FTRL optimization algorithm.Parameter linears updated by the FTRL optimization algorithm. -func RetrieveTPUEmbeddingFTRLParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingFTRLParameters", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear. -type QuantizedResizeBilinearAttr func(optionalAttr) - -// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value. -// -// value: If true, the centers of the 4 corner pixels of the input and output tensors are -// aligned, preserving the values at the corner pixels. Defaults to false. -// If not specified, defaults to false -func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr { - return func(m optionalAttr) { - m["align_corners"] = value - } -} - -// QuantizedResizeBilinearHalfPixelCenters sets the optional half_pixel_centers attribute to value. -// If not specified, defaults to false -func QuantizedResizeBilinearHalfPixelCenters(value bool) QuantizedResizeBilinearAttr { - return func(m optionalAttr) { - m["half_pixel_centers"] = value - } -} - -// Resize quantized `images` to `size` using quantized bilinear interpolation. -// -// Input images and output images must be quantized types. -// -// Arguments: -// images: 4-D with shape `[batch, height, width, channels]`. -// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The -// new size for the images. 
-// -// -// -// Returns 4-D with shape -// `[batch, new_height, new_width, channels]`. -func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QuantizedResizeBilinear", - Input: []tf.Input{ - images, size, min, max, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Computes gradients for SparseSegmentSqrtN. -// -// Returns tensor "output" with same shape as grad, except for dimension 0 whose -// value is output_dim0. -// -// Arguments: -// grad: gradient propagated to the SparseSegmentSqrtN op. -// indices: indices passed to the corresponding SparseSegmentSqrtN op. -// segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op. -// output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op. -func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseSegmentSqrtNGrad", - Input: []tf.Input{ - grad, indices, segment_ids, output_dim0, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes offsets of concat inputs within its output. -// -// For example: -// -// ``` -// # 'x' is [2, 2, 7] -// # 'y' is [2, 3, 7] -// # 'z' is [2, 5, 7] -// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0] -// ``` -// -// This is typically used by gradient computations for a concat operation. -// -// Arguments: -// concat_dim: The dimension along which to concatenate. -// shape: The `N` int32 vectors representing shape of tensors being concatenated. -// -// Returns The `N` int32 vectors representing the starting offset -// of input tensors within the concatenated output. -func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ConcatOffset", - Input: []tf.Input{ - concat_dim, tf.OutputList(shape), - }, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil { - scope.UpdateErr("ConcatOffset", err) - return - } - return offset -} - -// AllAttr is an optional argument to All. -type AllAttr func(optionalAttr) - -// AllKeepDims sets the optional keep_dims attribute to value. -// -// value: If true, retain reduced dimensions with length 1. -// If not specified, defaults to false -func AllKeepDims(value bool) AllAttr { - return func(m optionalAttr) { - m["keep_dims"] = value - } -} - -// Computes the "logical and" of elements across dimensions of a tensor. -// -// Reduces `input` along the dimensions given in `axis`. Unless -// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -// `axis`. If `keep_dims` is true, the reduced dimensions are -// retained with length 1. -// -// Arguments: -// input: The tensor to reduce. -// axis: The dimensions to reduce. Must be in the range -// `[-rank(input), rank(input))`. -// -// Returns The reduced tensor. 
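-//
-// A minimal usage sketch, added for illustration and not part of the generated
-// documentation. It assumes a caller importing the op and tf packages:
-//
-//    s := op.NewScope()
-//    input := op.Const(s, [][]bool{{true, true}, {true, false}})
-//    axis := op.Const(s, []int32{1})
-//    allTrue := op.All(s, input, axis, op.AllKeepDims(false))
-//    _ = allTrue // 1-D bool tensor: [true, false]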
-func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "All", - Input: []tf.Input{ - input, axis, + params, indices, }, Attrs: attrs, } @@ -32891,3857 +38736,6 @@ func LoopCond(scope *Scope, input tf.Output) (output tf.Output) { return op.Output(0) } -// CudnnRNNBackpropV3Attr is an optional argument to CudnnRNNBackpropV3. -type CudnnRNNBackpropV3Attr func(optionalAttr) - -// CudnnRNNBackpropV3RnnMode sets the optional rnn_mode attribute to value. -// If not specified, defaults to "lstm" -func CudnnRNNBackpropV3RnnMode(value string) CudnnRNNBackpropV3Attr { - return func(m optionalAttr) { - m["rnn_mode"] = value - } -} - -// CudnnRNNBackpropV3InputMode sets the optional input_mode attribute to value. -// If not specified, defaults to "linear_input" -func CudnnRNNBackpropV3InputMode(value string) CudnnRNNBackpropV3Attr { - return func(m optionalAttr) { - m["input_mode"] = value - } -} - -// CudnnRNNBackpropV3Direction sets the optional direction attribute to value. -// If not specified, defaults to "unidirectional" -func CudnnRNNBackpropV3Direction(value string) CudnnRNNBackpropV3Attr { - return func(m optionalAttr) { - m["direction"] = value - } -} - -// CudnnRNNBackpropV3Dropout sets the optional dropout attribute to value. -// If not specified, defaults to 0 -func CudnnRNNBackpropV3Dropout(value float32) CudnnRNNBackpropV3Attr { - return func(m optionalAttr) { - m["dropout"] = value - } -} - -// CudnnRNNBackpropV3Seed sets the optional seed attribute to value. -// If not specified, defaults to 0 -func CudnnRNNBackpropV3Seed(value int64) CudnnRNNBackpropV3Attr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// CudnnRNNBackpropV3Seed2 sets the optional seed2 attribute to value. -// If not specified, defaults to 0 -func CudnnRNNBackpropV3Seed2(value int64) CudnnRNNBackpropV3Attr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// CudnnRNNBackpropV3TimeMajor sets the optional time_major attribute to value. -// If not specified, defaults to true -func CudnnRNNBackpropV3TimeMajor(value bool) CudnnRNNBackpropV3Attr { - return func(m optionalAttr) { - m["time_major"] = value - } -} - -// Backprop step of CudnnRNNV3. -// -// Compute the backprop of both data and weights in a RNN. Takes an extra -// "sequence_lengths" input than CudnnRNNBackprop. -// -// rnn_mode: Indicates the type of the RNN model. -// input_mode: Indicates whether there is a linear projection between the input and -// the actual computation before the first layer. 'skip_input' is only allowed -// when input_size == num_units; 'auto_select' implies 'skip_input' when -// input_size == num_units; otherwise, it implies 'linear_input'. -// direction: Indicates whether a bidirectional model will be used. Should be -// "unidirectional" or "bidirectional". -// dropout: Dropout probability. When set to 0., dropout is disabled. -// seed: The 1st part of a seed to initialize dropout. -// seed2: The 2nd part of a seed to initialize dropout. -// input: If time_major is true, this is a 3-D tensor with the shape of -// [seq_length, batch_size, input_size]. If time_major is false, the shape is -// [batch_size, seq_length, input_size]. -// input_h: If time_major is true, this is a 3-D tensor with the shape of -// [num_layer * dir, batch_size, num_units]. 
If time_major is false, the shape -// is [batch_size, num_layer * dir, num_units]. -// input_c: For LSTM, a 3-D tensor with the shape of -// [num_layer * dir, batch, num_units]. For other models, it is ignored. -// params: A 1-D tensor that contains the weights and biases in an opaque layout. -// The size must be created through CudnnRNNParamsSize, and initialized -// separately. Note that they might not be compatible across different -// generations. So it is a good idea to save and restore -// sequence_lengths: a vector of lengths of each input sequence. -// output: If time_major is true, this is a 3-D tensor with the shape of -// [seq_length, batch_size, dir * num_units]. If time_major is false, the -// shape is [batch_size, seq_length, dir * num_units]. -// output_h: The same shape has input_h. -// output_c: The same shape as input_c for LSTM. An empty tensor for other models. -// output_backprop: A 3-D tensor with the same shape as output in the forward pass. -// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward -// pass. -// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward -// pass. -// time_major: Indicates whether the input/output format is time major or batch -// major. -// reserve_space: The same reserve_space produced in the forward operation. -// input_backprop: The backprop to input in the forward pass. Has the same shape -// as input. -// input_h_backprop: The backprop to input_h in the forward pass. Has the same -// shape as input_h. -// input_c_backprop: The backprop to input_c in the forward pass. Has the same -// shape as input_c. -// params_backprop: The backprop to the params buffer in the forward pass. Has the -// same shape as params. -func CudnnRNNBackpropV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, host_reserved tf.Output, optional ...CudnnRNNBackpropV3Attr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "CudnnRNNBackpropV3", - Input: []tf.Input{ - input, input_h, input_c, params, sequence_lengths, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, host_reserved, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3) -} - -// MapClearAttr is an optional argument to MapClear. -type MapClearAttr func(optionalAttr) - -// MapClearCapacity sets the optional capacity attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func MapClearCapacity(value int64) MapClearAttr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// MapClearMemoryLimit sets the optional memory_limit attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func MapClearMemoryLimit(value int64) MapClearAttr { - return func(m optionalAttr) { - m["memory_limit"] = value - } -} - -// MapClearContainer sets the optional container attribute to value. 
-// If not specified, defaults to "" -func MapClearContainer(value string) MapClearAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// MapClearSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func MapClearSharedName(value string) MapClearAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Op removes all elements in the underlying container. -// -// Returns the created operation. -func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtypes": dtypes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MapClear", - - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Creates a sequence of numbers. -// -// This operation creates a sequence of numbers that begins at `start` and -// extends by increments of `delta` up to but not including `limit`. -// -// For example: -// -// ``` -// # 'start' is 3 -// # 'limit' is 18 -// # 'delta' is 3 -// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15] -// ``` -// -// Arguments: -// start: 0-D (scalar). First entry in the sequence. -// limit: 0-D (scalar). Upper limit of sequence, exclusive. -// delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`. -// -// Returns 1-D. -func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Range", - Input: []tf.Input{ - start, limit, delta, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the complex conjugate of a complex number. -// -// Given a tensor `input` of complex numbers, this operation returns a tensor of -// complex numbers that are the complex conjugate of each element in `input`. The -// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the -// real part and *b* is the imaginary part. -// -// The complex conjugate returned by this operation is of the form \\(a - bj\\). -// -// For example: -// -// ``` -// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j] -// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j] -// ``` -func Conj(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Conj", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth. -type HistogramFixedWidthAttr func(optionalAttr) - -// HistogramFixedWidthDtype sets the optional dtype attribute to value. -// If not specified, defaults to DT_INT32 -func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr { - return func(m optionalAttr) { - m["dtype"] = value - } -} - -// Return histogram of values. -// -// Given the tensor `values`, this operation returns a rank 1 histogram counting -// the number of entries in `values` that fall into every bin. The bins are -// equal width and determined by the arguments `value_range` and `nbins`. 
-// -// ```python -// # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) -// nbins = 5 -// value_range = [0.0, 5.0] -// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] -// -// with tf.get_default_session() as sess: -// hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) -// variables.global_variables_initializer().run() -// sess.run(hist) => [2, 1, 1, 0, 2] -// ``` -// -// Arguments: -// values: Numeric `Tensor`. -// value_range: Shape [2] `Tensor` of same `dtype` as `values`. -// values <= value_range[0] will be mapped to hist[0], -// values >= value_range[1] will be mapped to hist[-1]. -// nbins: Scalar `int32 Tensor`. Number of histogram bins. -// -// Returns A 1-D `Tensor` holding histogram of values. -func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "HistogramFixedWidth", - Input: []tf.Input{ - values, value_range, nbins, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a dataset that will write to / read from a snapshot. -// -// This dataset attempts to determine whether a valid snapshot exists at the -// `snapshot_path`, and reads from the snapshot in lieu of using `input_dataset`. -// If not, it will run the preprocessing pipeline as usual, and write out a -// snapshot of the data processed for future use. -// -// Arguments: -// input_dataset: A variant tensor representing the input dataset. -// path: The path we should write snapshots to / read snapshots from. -// -// -func SnapshotDataset(scope *Scope, input_dataset tf.Output, path tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "SnapshotDataset", - Input: []tf.Input{ - input_dataset, path, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// CumprodAttr is an optional argument to Cumprod. -type CumprodAttr func(optionalAttr) - -// CumprodExclusive sets the optional exclusive attribute to value. -// -// value: If `True`, perform exclusive cumprod. -// If not specified, defaults to false -func CumprodExclusive(value bool) CumprodAttr { - return func(m optionalAttr) { - m["exclusive"] = value - } -} - -// CumprodReverse sets the optional reverse attribute to value. -// -// value: A `bool` (default: False). -// If not specified, defaults to false -func CumprodReverse(value bool) CumprodAttr { - return func(m optionalAttr) { - m["reverse"] = value - } -} - -// Compute the cumulative product of the tensor `x` along `axis`. 
-// -// By default, this op performs an inclusive cumprod, which means that the first -// element of the input is identical to the first element of the output: -// -// ```python -// tf.cumprod([a, b, c]) # => [a, a * b, a * b * c] -// ``` -// -// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is -// performed instead: -// -// ```python -// tf.cumprod([a, b, c], exclusive=True) # => [1, a, a * b] -// ``` -// -// By setting the `reverse` kwarg to `True`, the cumprod is performed in the -// opposite direction: -// -// ```python -// tf.cumprod([a, b, c], reverse=True) # => [a * b * c, b * c, c] -// ``` -// -// This is more efficient than using separate `tf.reverse` ops. -// -// The `reverse` and `exclusive` kwargs can also be combined: -// -// ```python -// tf.cumprod([a, b, c], exclusive=True, reverse=True) # => [b * c, c, 1] -// ``` -// -// Arguments: -// x: A `Tensor`. Must be one of the following types: `float32`, `float64`, -// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, -// `complex128`, `qint8`, `quint8`, `qint32`, `half`. -// axis: A `Tensor` of type `int32` (default: 0). Must be in the range -// `[-rank(x), rank(x))`. -func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Cumprod", - Input: []tf.Input{ - x, axis, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4. -type NonMaxSuppressionV4Attr func(optionalAttr) - -// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value. -// -// value: If true, the output `selected_indices` is padded to be of length -// `max_output_size`. Defaults to false. -// If not specified, defaults to false -func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr { - return func(m optionalAttr) { - m["pad_to_max_output_size"] = value - } -} - -// Greedily selects a subset of bounding boxes in descending order of score, -// -// pruning away boxes that have high intersection-over-union (IOU) overlap -// with previously selected boxes. Bounding boxes with score less than -// `score_threshold` are removed. Bounding boxes are supplied as -// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any -// diagonal pair of box corners and the coordinates can be provided as normalized -// (i.e., lying in the interval [0, 1]) or absolute. Note that this algorithm -// is agnostic to where the origin is in the coordinate system and more -// generally is invariant to orthogonal transformations and translations -// of the coordinate system; thus translating or reflections of the coordinate -// system result in the same boxes being selected by the algorithm. -// The output of this operation is a set of integers indexing into the input -// collection of bounding boxes representing the selected boxes. The bounding -// box coordinates corresponding to the selected indices can then be obtained -// using the `tf.gather operation`. For example: -// selected_indices = tf.image.non_max_suppression_v2( -// boxes, scores, max_output_size, iou_threshold, score_threshold) -// selected_boxes = tf.gather(boxes, selected_indices) -// -// Arguments: -// boxes: A 2-D float tensor of shape `[num_boxes, 4]`. 
-// scores: A 1-D float tensor of shape `[num_boxes]` representing a single -// score corresponding to each box (each row of boxes). -// max_output_size: A scalar integer tensor representing the maximum number of -// boxes to be selected by non max suppression. -// iou_threshold: A 0-D float tensor representing the threshold for deciding whether -// boxes overlap too much with respect to IOU. -// score_threshold: A 0-D float tensor representing the threshold for deciding when to remove -// boxes based on score. -// -// Returns A 1-D integer tensor of shape `[M]` representing the selected -// indices from the boxes tensor, where `M <= max_output_size`.A 0-D integer tensor representing the number of valid elements in -// `selected_indices`, with the valid elements appearing first. -func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "NonMaxSuppressionV4", - Input: []tf.Input{ - boxes, scores, max_output_size, iou_threshold, score_threshold, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// Convert the quantized 'input' tensor into a lower-precision 'output', using the -// -// actual distribution of the values to maximize the usage of the lower bit depth -// and adjusting the output min and max ranges accordingly. -// -// [input_min, input_max] are scalar floats that specify the range for the float -// interpretation of the 'input' data. For example, if input_min is -1.0f and -// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0 -// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f. -// -// This operator tries to squeeze as much precision as possible into an output with -// a lower bit depth by calculating the actual min and max values found in the -// data. For example, maybe that quint16 input has no values lower than 16,384 and -// none higher than 49,152. That means only half the range is actually needed, all -// the float interpretations are between -0.5f and 0.5f, so if we want to compress -// the data into a quint8 output, we can use that range rather than the theoretical -// -1.0f to 1.0f that is suggested by the input min and max. -// -// In practice, this is most useful for taking output from operations like -// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and -// may have large potential output ranges, but in practice have a distribution of -// input values that only uses a small fraction of the possible range. By feeding -// that output into this operator, we can reduce it from 32 bits down to 8 with -// minimal loss of accuracy. -// -// Arguments: -// -// input_min: The float value that the minimum quantized input value represents. -// input_max: The float value that the maximum quantized input value represents. -// out_type: The type of the output. Should be a lower bit depth than Tinput. -// -// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents. 
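-//
-// A hedged usage sketch, added for illustration only. It assumes qprod, qmin and
-// qmax are the qint32 output and float range produced earlier in the graph by an
-// op such as QuantizedMatMul:
-//
-//    out, outMin, outMax := op.QuantizeDownAndShrinkRange(s, qprod, qmin, qmax, tf.Quint8)
-//    _, _, _ = out, outMin, outMax // quint8 values with a tightened float range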
-func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"out_type": out_type} - opspec := tf.OpSpec{ - Type: "QuantizeDownAndShrinkRange", - Input: []tf.Input{ - input, input_min, input_max, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Creates a dataset that batches and pads `batch_size` elements from the input. -// -// Arguments: -// -// batch_size: A scalar representing the number of elements to accumulate in a -// batch. -// padded_shapes: A list of int64 tensors representing the desired padded shapes -// of the corresponding output components. These shapes may be partially -// specified, using `-1` to indicate that a particular dimension should be -// padded to the maximum size of all batch elements. -// padding_values: A list of scalars containing the padding value to use for -// each of the outputs. -// -func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "PaddedBatchDataset", - Input: []tf.Input{ - input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Converts the quantized `input` tensor into a lower-precision `output`. -// -// Converts the quantized `input` tensor into a lower-precision `output`, using the -// output range specified with `requested_output_min` and `requested_output_max`. -// -// `[input_min, input_max]` are scalar floats that specify the range for the float -// interpretation of the `input` data. For example, if `input_min` is -1.0f and -// `input_max` is 1.0f, and we are dealing with `quint16` quantized data, then a 0 -// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f. -// -// Arguments: -// -// input_min: The float value that the minimum quantized input value represents. -// input_max: The float value that the maximum quantized input value represents. -// requested_output_min: The float value that the minimum quantized output value represents. -// requested_output_max: The float value that the maximum quantized output value represents. -// out_type: The type of the output. Should be a lower bit depth than Tinput. -// -// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output. -func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"out_type": out_type} - opspec := tf.OpSpec{ - Type: "Requantize", - Input: []tf.Input{ - input, input_min, input_max, requested_output_min, requested_output_max, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Returns true if queue is closed. -// -// This operation returns true if the queue is closed and false if the queue -// is open. 
-// -// Arguments: -// handle: The handle to a queue. -func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "QueueIsClosedV2", - Input: []tf.Input{ - handle, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// SetSizeAttr is an optional argument to SetSize. -type SetSizeAttr func(optionalAttr) - -// SetSizeValidateIndices sets the optional validate_indices attribute to value. -// If not specified, defaults to true -func SetSizeValidateIndices(value bool) SetSizeAttr { - return func(m optionalAttr) { - m["validate_indices"] = value - } -} - -// Number of unique elements along last dimension of input `set`. -// -// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`, -// and `set_shape`. The last dimension contains values in a set, duplicates are -// allowed but ignored. -// -// If `validate_indices` is `True`, this op validates the order and range of `set` -// indices. -// -// Arguments: -// set_indices: 2D `Tensor`, indices of a `SparseTensor`. -// set_values: 1D `Tensor`, values of a `SparseTensor`. -// set_shape: 1D `Tensor`, shape of a `SparseTensor`. -// -// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st -// `n-1` dimensions as `set`. Each value is the number of unique elements in -// the corresponding `[0...n-1]` dimension of `set`. -func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SetSize", - Input: []tf.Input{ - set_indices, set_values, set_shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`. -// -// Each comparison returns a boolean `true` (if `input_value > threshold`) -// or and `false` otherwise. -// -// This operation is useful for Locality-Sensitive-Hashing (LSH) and other -// algorithms that use hashing approximations of cosine and `L2` distances; -// codes can be generated from an input via: -// -// ```python -// codebook_size = 50 -// codebook_bits = codebook_size * 32 -// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits], -// dtype=x.dtype, -// initializer=tf.orthogonal_initializer()) -// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.) -// codes = tf.bitcast(codes, tf.int32) # go from uint8 to int32 -// # now codes has shape x.shape[:-1] + [codebook_size] -// ``` -// -// **NOTE**: Currently, the innermost dimension of the tensor must be divisible -// by 8. -// -// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is -// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`. -// -// Arguments: -// input: Values to compare against `threshold` and bitpack. -// threshold: Threshold to compare against. -// -// Returns The bitpacked comparisons. -func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "CompareAndBitpack", - Input: []tf.Input{ - input, threshold, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Rolls the elements of a tensor along an axis. 
-// -// The elements are shifted positively (towards larger indices) by the offset of -// `shift` along the dimension of `axis`. Negative `shift` values will shift -// elements in the opposite direction. Elements that roll passed the last position -// will wrap around to the first and vice versa. Multiple shifts along multiple -// axes may be specified. -// -// For example: -// -// ``` -// # 't' is [0, 1, 2, 3, 4] -// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2] -// -// # shifting along multiple dimensions -// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] -// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]] -// -// # shifting along the same axis multiple times -// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] -// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]] -// ``` -// -// Arguments: -// -// shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which -// elements are shifted positively (towards larger indices) along the dimension -// specified by `axis[i]`. Negative shifts will roll the elements in the opposite -// direction. -// axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift -// `shift[i]` should occur. If the same axis is referenced more than once, the -// total shift for that axis will be the sum of all the shifts that belong to that -// axis. -// -// Returns Has the same shape and size as the input. The elements are shifted -// positively (towards larger indices) by the offsets of `shift` along the -// dimensions of `axis`. -func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Roll", - Input: []tf.Input{ - input, shift, axis, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// QuantizedAddAttr is an optional argument to QuantizedAdd. -type QuantizedAddAttr func(optionalAttr) - -// QuantizedAddToutput sets the optional Toutput attribute to value. -// If not specified, defaults to DT_QINT32 -func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr { - return func(m optionalAttr) { - m["Toutput"] = value - } -} - -// Returns x + y element-wise, working on quantized buffers. -// -// Arguments: -// -// -// min_x: The float value that the lowest quantized `x` value represents. -// max_x: The float value that the highest quantized `x` value represents. -// min_y: The float value that the lowest quantized `y` value represents. -// max_y: The float value that the highest quantized `y` value represents. -// -// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents. -// -// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about -// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QuantizedAdd", - Input: []tf.Input{ - x, y, min_x, max_x, min_y, max_y, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Bucketizes 'input' based on 'boundaries'. 
-// -// For example, if the inputs are -// boundaries = [0, 10, 100] -// input = [[-5, 10000] -// [150, 10] -// [5, 100]] -// -// then the output will be -// output = [[0, 3] -// [3, 2] -// [1, 3]] -// -// Arguments: -// input: Any shape of Tensor contains with int or float type. -// boundaries: A sorted list of floats gives the boundary of the buckets. -// -// Returns Same shape with 'input', each value of input replaced with bucket index. -// -// @compatibility(numpy) -// Equivalent to np.digitize. -// @end_compatibility -func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"boundaries": boundaries} - opspec := tf.OpSpec{ - Type: "Bucketize", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes requantization range per channel. -// -// Arguments: -// input: The original input tensor. -// input_min: The minimum value of the input tensor -// input_max: The maximum value of the input tensor. -// clip_value_max: The maximum value of the output that needs to be clipped. -// Example: set this to 6 for Relu6. -// -// Returns The minimum value of the final output tensorThe maximum value of the final output tensor. -func RequantizationRangePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, clip_value_max float32) (output_min tf.Output, output_max tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"clip_value_max": clip_value_max} - opspec := tf.OpSpec{ - Type: "RequantizationRangePerChannel", - Input: []tf.Input{ - input, input_min, input_max, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// EnterAttr is an optional argument to Enter. -type EnterAttr func(optionalAttr) - -// EnterIsConstant sets the optional is_constant attribute to value. -// -// value: If true, the output is constant within the child frame. -// If not specified, defaults to false -func EnterIsConstant(value bool) EnterAttr { - return func(m optionalAttr) { - m["is_constant"] = value - } -} - -// EnterParallelIterations sets the optional parallel_iterations attribute to value. -// -// value: The number of iterations allowed to run in parallel. -// If not specified, defaults to 10 -func EnterParallelIterations(value int64) EnterAttr { - return func(m optionalAttr) { - m["parallel_iterations"] = value - } -} - -// Creates or finds a child frame, and makes `data` available to the child frame. -// -// This op is used together with `Exit` to create loops in the graph. -// The unique `frame_name` is used by the `Executor` to identify frames. If -// `is_constant` is true, `output` is a constant in the child frame; otherwise -// it may be changed in the child frame. At most `parallel_iterations` iterations -// are run in parallel in the child frame. -// -// Arguments: -// data: The tensor to be made available to the child frame. -// frame_name: The name of the child frame. -// -// Returns The same tensor as `data`. 
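-//
-// A minimal construction sketch, added for illustration only; a complete loop
-// also needs the companion control-flow ops (Merge, Switch, NextIteration, Exit),
-// which are omitted here. It assumes a caller importing the op and tf packages:
-//
-//    s := op.NewScope()
-//    data := op.Const(s, int32(0))
-//    inFrame := op.Enter(s, data, "while/loop_frame", op.EnterIsConstant(false))
-//    _ = inFrame // same value as data, now available inside the child frame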
-func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"frame_name": frame_name} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Enter", - Input: []tf.Input{ - data, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the next representable value of `x1` in the direction of `x2`, element-wise. -// -// This operation returns the same result as the C++ std::nextafter function. -// -// It can also return a subnormal number. -// -// @compatibility(cpp) -// Equivalent to C++ std::nextafter function. -// @end_compatibility -func NextAfter(scope *Scope, x1 tf.Output, x2 tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "NextAfter", - Input: []tf.Input{ - x1, x2, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Outputs random integers from a uniform distribution. -// -// The generated values are uniform integers in the range `[minval, maxval)`. -// The lower bound `minval` is included in the range, while the upper bound -// `maxval` is excluded. -// -// The random integers are slightly biased unless `maxval - minval` is an exact -// power of two. The bias is small for values of `maxval - minval` significantly -// smaller than the range of the output (either `2^32` or `2^64`). -// -// Arguments: -// resource: The handle of the resource variable that stores the state of the RNG. -// algorithm: The RNG algorithm. -// shape: The shape of the output tensor. -// minval: Minimum value (inclusive, scalar). -// maxval: Maximum value (exclusive, scalar). -// -// Returns Random values with specified shape. -func StatefulUniformInt(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "StatefulUniformInt", - Input: []tf.Input{ - resource, algorithm, shape, minval, maxval, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Looks up keys in a table, outputs the corresponding values. -// -// The tensor `keys` must of the same type as the keys of the table. -// The output `values` is of the type of the table values. -// -// The scalar `default_value` is the value output for keys not present in the -// table. It must also be of the same type as the table values. -// -// Arguments: -// table_handle: Handle to the table. -// keys: Any shape. Keys to look up. -// -// -// Returns Same shape as `keys`. Values found in the table, or `default_values` -// for missing keys. -func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "LookupTableFindV2", - Input: []tf.Input{ - table_handle, keys, default_value, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Removes keys and its associated values from a table. -// -// The tensor `keys` must of the same type as the keys of the table. Keys not -// already in the table are silently ignored. -// -// Arguments: -// table_handle: Handle to the table. -// keys: Any shape. Keys of the elements to remove. -// -// Returns the created operation. 
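-//
-// A minimal usage sketch, added for illustration only (not generated
-// documentation). It assumes a mutable table created in the same graph and a
-// caller importing the op and tf packages:
-//
-//    s := op.NewScope()
-//    table := op.MutableHashTableV2(s, tf.Int64, tf.Float)
-//    keys := op.Const(s, []int64{1, 2})
-//    removeOp := op.LookupTableRemoveV2(s, table, keys)
-//    _ = removeOp // run this *tf.Operation as a session target to delete the keys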
-func LookupTableRemoveV2(scope *Scope, table_handle tf.Output, keys tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "LookupTableRemoveV2", - Input: []tf.Input{ - table_handle, keys, - }, - } - return scope.AddOperation(opspec) -} - -// Computes the number of elements in the given table. -// -// Arguments: -// table_handle: Handle to the table. -// -// Returns Scalar that contains number of elements in the table. -func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "LookupTableSizeV2", - Input: []tf.Input{ - table_handle, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// PrintAttr is an optional argument to Print. -type PrintAttr func(optionalAttr) - -// PrintMessage sets the optional message attribute to value. -// -// value: A string, prefix of the error message. -// If not specified, defaults to "" -func PrintMessage(value string) PrintAttr { - return func(m optionalAttr) { - m["message"] = value - } -} - -// PrintFirstN sets the optional first_n attribute to value. -// -// value: Only log `first_n` number of times. -1 disables logging. -// If not specified, defaults to -1 -func PrintFirstN(value int64) PrintAttr { - return func(m optionalAttr) { - m["first_n"] = value - } -} - -// PrintSummarize sets the optional summarize attribute to value. -// -// value: Only print this many entries of each tensor. -// If not specified, defaults to 3 -func PrintSummarize(value int64) PrintAttr { - return func(m optionalAttr) { - m["summarize"] = value - } -} - -// Prints a list of tensors. -// -// Passes `input` through to `output` and prints `data` when evaluating. -// -// Arguments: -// input: The tensor passed to `output` -// data: A list of tensors to print out when op is evaluated. -// -// Returns = The unmodified `input` tensor -func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Print", - Input: []tf.Input{ - input, tf.OutputList(data), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Selects the k nearest centers for each point. -// -// Rows of points are assumed to be input points. Rows of centers are assumed to be -// the list of candidate centers. For each point, the k centers that have least L2 -// distance to it are computed. -// -// Arguments: -// points: Matrix of shape (n, d). Rows are assumed to be input points. -// centers: Matrix of shape (m, d). Rows are assumed to be centers. -// k: Number of nearest centers to return for each point. If k is larger than m, then -// only m centers are returned. -// -// Returns Matrix of shape (n, min(m, k)). Each row contains the indices of the centers -// closest to the corresponding point, ordered by increasing distance.Matrix of shape (n, min(m, k)). Each row contains the squared L2 distance to the -// corresponding center in nearest_center_indices. 
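-//
-// A minimal usage sketch, added for illustration only. It assumes a caller
-// importing the op and tf packages:
-//
-//    s := op.NewScope()
-//    points := op.Const(s, [][]float32{{0, 0}, {5, 5}})
-//    centers := op.Const(s, [][]float32{{0, 1}, {4, 4}, {10, 10}})
-//    k := op.Const(s, int64(2))
-//    idx, dist := op.NearestNeighbors(s, points, centers, k)
-//    _, _ = idx, dist // both of shape (2, 2): the 2 closest centers per point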
-func NearestNeighbors(scope *Scope, points tf.Output, centers tf.Output, k tf.Output) (nearest_center_indices tf.Output, nearest_center_distances tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "NearestNeighbors", - Input: []tf.Input{ - points, centers, k, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// ComplexAbsAttr is an optional argument to ComplexAbs. -type ComplexAbsAttr func(optionalAttr) - -// ComplexAbsTout sets the optional Tout attribute to value. -// If not specified, defaults to DT_FLOAT -func ComplexAbsTout(value tf.DataType) ComplexAbsAttr { - return func(m optionalAttr) { - m["Tout"] = value - } -} - -// Computes the complex absolute value of a tensor. -// -// Given a tensor `x` of complex numbers, this operation returns a tensor of type -// `float` or `double` that is the absolute value of each element in `x`. All -// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute -// value is computed as \\( \sqrt{a^2 + b^2}\\). -func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ComplexAbs", - Input: []tf.Input{ - x, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Replaces the contents of the table with the specified keys and values. -// -// The tensor `keys` must be of the same type as the keys of the table. -// The tensor `values` must be of the type of the table values. -// -// Arguments: -// table_handle: Handle to the table. -// keys: Any shape. Keys to look up. -// values: Values to associate with keys. -// -// Returns the created operation. -func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "LookupTableImportV2", - Input: []tf.Input{ - table_handle, keys, values, - }, - } - return scope.AddOperation(opspec) -} - -// HashTableV2Attr is an optional argument to HashTableV2. -type HashTableV2Attr func(optionalAttr) - -// HashTableV2Container sets the optional container attribute to value. -// -// value: If non-empty, this table is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func HashTableV2Container(value string) HashTableV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// HashTableV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this table is shared under the given name across -// multiple sessions. -// If not specified, defaults to "" -func HashTableV2SharedName(value string) HashTableV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value. -// -// value: If true and shared_name is empty, the table is shared -// using the node name. -// If not specified, defaults to false -func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr { - return func(m optionalAttr) { - m["use_node_name_sharing"] = value - } -} - -// Creates a non-initialized hash table. -// -// This op creates a hash table, specifying the type of its keys and values. -// Before using the table you will have to initialize it. After initialization the -// table will be immutable. 
-// -// Arguments: -// key_dtype: Type of the table keys. -// value_dtype: Type of the table values. -// -// Returns Handle to a table. -func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "HashTableV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MutableHashTableV2Attr is an optional argument to MutableHashTableV2. -type MutableHashTableV2Attr func(optionalAttr) - -// MutableHashTableV2Container sets the optional container attribute to value. -// -// value: If non-empty, this table is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func MutableHashTableV2Container(value string) MutableHashTableV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// MutableHashTableV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this table is shared under the given name across -// multiple sessions. -// If not specified, defaults to "" -func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value. -// -// value: If true and shared_name is empty, the table is shared -// using the node name. -// If not specified, defaults to false -func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr { - return func(m optionalAttr) { - m["use_node_name_sharing"] = value - } -} - -// Creates an empty hash table. -// -// This op creates a mutable hash table, specifying the type of its keys and -// values. Each value must be a scalar. Data can be inserted into the table using -// the insert operations. It does not support the initialization operation. -// -// Arguments: -// key_dtype: Type of the table keys. -// value_dtype: Type of the table values. -// -// Returns Handle to a table. -func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MutableHashTableV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// DequantizeAttr is an optional argument to Dequantize. -type DequantizeAttr func(optionalAttr) - -// DequantizeMode sets the optional mode attribute to value. -// If not specified, defaults to "MIN_COMBINED" -func DequantizeMode(value string) DequantizeAttr { - return func(m optionalAttr) { - m["mode"] = value - } -} - -// Dequantize the 'input' tensor into a float Tensor. -// -// [min_range, max_range] are scalar floats that specify the range for -// the 'input' data. The 'mode' attribute controls exactly which calculations are -// used to convert the float values to their quantized equivalents. 
-// -// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following: -// -// ``` -// if T == qint8: in[i] += (range(T) + 1)/ 2.0 -// out[i] = min_range + (in[i]* (max_range - min_range) / range(T)) -// ``` -// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()` -// -// *MIN_COMBINED Mode Example* -// -// If the input comes from a QuantizedRelu6, the output type is -// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is -// 0-6. The min_range and max_range values are therefore 0.0 and 6.0. -// Dequantize on quint8 will take each value, cast to float, and multiply -// by 6 / 255. -// Note that if quantizedtype is qint8, the operation will additionally add -// each value by 128 prior to casting. -// -// If the mode is 'MIN_FIRST', then this approach is used: -// -// ```c++ -// num_discrete_values = 1 << (# of bits in T) -// range_adjust = num_discrete_values / (num_discrete_values - 1) -// range = (range_max - range_min) * range_adjust -// range_scale = range / num_discrete_values -// const double offset_input = static_cast<double>(input) - lowest_quantized; -// result = range_min + ((input - numeric_limits<T>::min()) * range_scale) -// ``` -// -// *SCALED mode Example* -// -// `SCALED` mode matches the quantization approach used in -// `QuantizeAndDequantize{V2|V3}`. -// -// If the mode is `SCALED`, we do not use the full range of the output type, -// choosing to elide the lowest possible value for symmetry (e.g., output range is -// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to -// 0. -// -// We first find the range of values in our tensor. The -// range we use is always centered on 0, so we find m such that -// ```c++ -// m = max(abs(input_min), abs(input_max)) -// ``` -// -// Our input tensor range is then `[-m, m]`. -// -// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`. -// If T is signed, this is -// ``` -// num_bits = sizeof(T) * 8 -// [min_fixed, max_fixed] = -// [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1] -// ``` -// -// Otherwise, if T is unsigned, the fixed-point range is -// ``` -// [min_fixed, max_fixed] = [0, (1 << num_bits) - 1] -// ``` -// -// From this we compute our scaling factor, s: -// ```c++ -// s = (2 * m) / (max_fixed - min_fixed) -// ``` -// -// Now we can dequantize the elements of our tensor: -// ```c++ -// result = input * s -// ``` -// -// Arguments: -// -// min_range: The minimum scalar value possibly produced for the input. -// max_range: The maximum scalar value possibly produced for the input. -func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Dequantize", - Input: []tf.Input{ - input, min_range, max_range, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns x // y element-wise. -// -// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "FloorDiv", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the sum along segments of a tensor. 
-// -// Read -// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) -// for an explanation of segments. -// -// Computes a tensor such that -// \\(output_i = \sum_j data_j\\) where sum is over `j` such -// that `segment_ids[j] == i`. -// -// If the sum is empty for a given segment ID `i`, `output[i] = 0`. -// -// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;"> -// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt> -// </div> -// -// For example: -// -// ``` -// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]]) -// tf.segment_sum(c, tf.constant([0, 0, 1])) -// # ==> [[5, 5, 5, 5], -// # [5, 6, 7, 8]] -// ``` -// -// -// Arguments: -// -// segment_ids: A 1-D tensor whose size is equal to the size of `data`'s -// first dimension. Values should be sorted and can be repeated. -// -// Returns Has same shape as data, except for dimension 0 which -// has size `k`, the number of segments. -func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SegmentSum", - Input: []tf.Input{ - data, segment_ids, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes tan of x element-wise. -func Tan(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Tan", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2. -type MutableHashTableOfTensorsV2Attr func(optionalAttr) - -// MutableHashTableOfTensorsV2Container sets the optional container attribute to value. -// -// value: If non-empty, this table is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this table is shared under the given name across -// multiple sessions. -// If not specified, defaults to "" -func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value. -// If not specified, defaults to false -func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr { - return func(m optionalAttr) { - m["use_node_name_sharing"] = value - } -} - -// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value. -// If not specified, defaults to <> -func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr { - return func(m optionalAttr) { - m["value_shape"] = value - } -} - -// Creates an empty hash table. -// -// This op creates a mutable hash table, specifying the type of its keys and -// values. Each value must be a vector. Data can be inserted into the table using -// the insert operations. It does not support the initialization operation. -// -// Arguments: -// key_dtype: Type of the table keys. -// value_dtype: Type of the table values. -// -// Returns Handle to a table. 
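For reference, a sketch of how the option helpers above compose when building a table handle with the `MutableHashTableOfTensorsV2` wrapper defined just below. It assumes imports of `tf "github.com/tensorflow/tensorflow/tensorflow/go"` and `"github.com/tensorflow/tensorflow/tensorflow/go/op"`; the shared name and value shape are purely illustrative:

```go
func newEmbeddingTable(s *op.Scope) tf.Output {
	// int64 keys mapping to length-128 float32 vectors; the table is shared
	// across sessions under the (illustrative) name "embeddings".
	return op.MutableHashTableOfTensorsV2(s, tf.Int64, tf.Float,
		op.MutableHashTableOfTensorsV2SharedName("embeddings"),
		op.MutableHashTableOfTensorsV2ValueShape(tf.MakeShape(128)),
	)
}
```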
-func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MutableHashTableOfTensorsV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a dataset that contains `rate` elements from the `input_dataset`. -// -// Arguments: -// -// rate: A scalar representing the sample rate of elements from the `input_dataset` -// that should be taken. -// seed: A scalar representing seed of random number generator. -// seed2: A scalar representing seed2 of random number generator. -// -// -func SamplingDataset(scope *Scope, input_dataset tf.Output, rate tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "SamplingDataset", - Input: []tf.Input{ - input_dataset, rate, seed, seed2, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// EuclideanNormAttr is an optional argument to EuclideanNorm. -type EuclideanNormAttr func(optionalAttr) - -// EuclideanNormKeepDims sets the optional keep_dims attribute to value. -// -// value: If true, retain reduced dimensions with length 1. -// If not specified, defaults to false -func EuclideanNormKeepDims(value bool) EuclideanNormAttr { - return func(m optionalAttr) { - m["keep_dims"] = value - } -} - -// Computes the euclidean norm of elements across dimensions of a tensor. -// -// Reduces `input` along the dimensions given in `axis`. Unless -// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -// `axis`. If `keep_dims` is true, the reduced dimensions are -// retained with length 1. -// -// Arguments: -// input: The tensor to reduce. -// axis: The dimensions to reduce. Must be in the range -// `[-rank(input), rank(input))`. -// -// Returns The reduced tensor. -func EuclideanNorm(scope *Scope, input tf.Output, axis tf.Output, optional ...EuclideanNormAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "EuclideanNorm", - Input: []tf.Input{ - input, axis, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StackPushV2Attr is an optional argument to StackPushV2. -type StackPushV2Attr func(optionalAttr) - -// StackPushV2SwapMemory sets the optional swap_memory attribute to value. -// -// value: Swap `elem` to CPU. Default to false. -// If not specified, defaults to false -func StackPushV2SwapMemory(value bool) StackPushV2Attr { - return func(m optionalAttr) { - m["swap_memory"] = value - } -} - -// Push an element onto the stack. -// -// Arguments: -// handle: The handle to a stack. -// elem: The tensor to be pushed onto the stack. -// -// Returns The same tensor as the input 'elem'. 
-func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StackPushV2", - Input: []tf.Input{ - handle, elem, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// SparseMatMulAttr is an optional argument to SparseMatMul. -type SparseMatMulAttr func(optionalAttr) - -// SparseMatMulTransposeA sets the optional transpose_a attribute to value. -// If not specified, defaults to false -func SparseMatMulTransposeA(value bool) SparseMatMulAttr { - return func(m optionalAttr) { - m["transpose_a"] = value - } -} - -// SparseMatMulTransposeB sets the optional transpose_b attribute to value. -// If not specified, defaults to false -func SparseMatMulTransposeB(value bool) SparseMatMulAttr { - return func(m optionalAttr) { - m["transpose_b"] = value - } -} - -// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value. -// If not specified, defaults to false -func SparseMatMulAIsSparse(value bool) SparseMatMulAttr { - return func(m optionalAttr) { - m["a_is_sparse"] = value - } -} - -// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value. -// If not specified, defaults to false -func SparseMatMulBIsSparse(value bool) SparseMatMulAttr { - return func(m optionalAttr) { - m["b_is_sparse"] = value - } -} - -// Multiply matrix "a" by matrix "b". -// -// The inputs must be two-dimensional matrices and the inner dimension of "a" must -// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not -// `SparseTensor`s. This op is optimized for the case where at least one of "a" or -// "b" is sparse, in the sense that they have a large proportion of zero values. -// The breakeven for using this versus a dense matrix multiply on one platform was -// 30% zero values in the sparse matrix. -// -// The gradient computation of this operation will only take advantage of sparsity -// in the input gradient when that gradient comes from a Relu. -func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SparseMatMul", - Input: []tf.Input{ - a, b, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2. -type InitializeTableFromTextFileV2Attr func(optionalAttr) - -// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value. -// -// value: Number of elements of the file, use -1 if unknown. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr { - return func(m optionalAttr) { - m["vocab_size"] = value - } -} - -// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value. -// -// value: Delimiter to separate fields in a line. -// If not specified, defaults to "\t" -func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr { - return func(m optionalAttr) { - m["delimiter"] = value - } -} - -// Initializes a table from a text file. -// -// It inserts one key-value pair into the table for each line of the file. 
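A short sketch of the `SparseMatMul` wrapper above with its functional options (same imports as the earlier sketch; whether the hint pays off depends on how sparse `a` actually is):

```go
func sparseDenseProduct(s *op.Scope, a, b tf.Output) tf.Output {
	// Both inputs are ordinary dense Tensors; the attrs only hint that "a"
	// is mostly zeros and that "b" should be read transposed.
	return op.SparseMatMul(s, a, b,
		op.SparseMatMulAIsSparse(true),
		op.SparseMatMulTransposeB(true),
	)
}
```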
-// The key and value is extracted from the whole line content, elements from the -// split line based on `delimiter` or the line number (starting from zero). -// Where to extract the key and value from a line is specified by `key_index` and -// `value_index`. -// -// - A value of -1 means use the line number(starting from zero), expects `int64`. -// - A value of -2 means use the whole line content, expects `string`. -// - A value >= 0 means use the index (starting at zero) of the split line based -// on `delimiter`. -// -// Arguments: -// table_handle: Handle to a table which will be initialized. -// filename: Filename of a vocabulary text file. -// key_index: Column index in a line to get the table `key` values from. -// value_index: Column index that represents information of a line to get the table -// `value` values from. -// -// Returns the created operation. -func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "InitializeTableFromTextFileV2", - Input: []tf.Input{ - table_handle, filename, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// PrintV2Attr is an optional argument to PrintV2. -type PrintV2Attr func(optionalAttr) - -// PrintV2OutputStream sets the optional output_stream attribute to value. -// -// value: A string specifying the output stream or logging level to print to. -// If not specified, defaults to "stderr" -func PrintV2OutputStream(value string) PrintV2Attr { - return func(m optionalAttr) { - m["output_stream"] = value - } -} - -// PrintV2End sets the optional end attribute to value. -// If not specified, defaults to "\n" -func PrintV2End(value string) PrintV2Attr { - return func(m optionalAttr) { - m["end"] = value - } -} - -// Prints a string scalar. -// -// Prints a string scalar to the desired output_stream. -// -// Arguments: -// input: The string scalar to print. -// -// Returns the created operation. -func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "PrintV2", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// ImageSummaryAttr is an optional argument to ImageSummary. -type ImageSummaryAttr func(optionalAttr) - -// ImageSummaryMaxImages sets the optional max_images attribute to value. -// -// value: Max number of batch elements to generate images for. -// If not specified, defaults to 3 -// -// REQUIRES: value >= 1 -func ImageSummaryMaxImages(value int64) ImageSummaryAttr { - return func(m optionalAttr) { - m["max_images"] = value - } -} - -// ImageSummaryBadColor sets the optional bad_color attribute to value. -// -// value: Color to use for pixels with non-finite values. -// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 > -func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { - return func(m optionalAttr) { - m["bad_color"] = value - } -} - -// Outputs a `Summary` protocol buffer with images. -// -// The summary has up to `max_images` summary values containing images. 
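Combining `HashTableV2` (earlier in this file) with the `InitializeTableFromTextFileV2` key_index/value_index convention above gives the usual token-to-id vocabulary table. A minimal sketch, with the same assumed imports; the returned initializer must be run once before any lookup:

```go
func vocabTable(s *op.Scope, vocabPath string) (table tf.Output, init *tf.Operation) {
	table = op.HashTableV2(s, tf.String, tf.Int64)
	// key_index = -2: the key is the whole line; value_index = -1: the value
	// is the zero-based line number.
	init = op.InitializeTableFromTextFileV2(s, table, op.Const(s, vocabPath), -2, -1)
	return table, init
}
```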
The -// images are built from `tensor` which must be 4-D with shape `[batch_size, -// height, width, channels]` and where `channels` can be: -// -// * 1: `tensor` is interpreted as Grayscale. -// * 3: `tensor` is interpreted as RGB. -// * 4: `tensor` is interpreted as RGBA. -// -// The images have the same number of channels as the input tensor. For float -// input, the values are normalized one image at a time to fit in the range -// `[0, 255]`. `uint8` values are unchanged. The op uses two different -// normalization algorithms: -// -// * If the input values are all positive, they are rescaled so the largest one -// is 255. -// -// * If any input value is negative, the values are shifted so input value 0.0 -// is at 127. They are then rescaled so that either the smallest value is 0, -// or the largest one is 255. -// -// The `tag` argument is a scalar `Tensor` of type `string`. It is used to -// build the `tag` of the summary values: -// -// * If `max_images` is 1, the summary value tag is '*tag*/image'. -// * If `max_images` is greater than 1, the summary value tags are -// generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. -// -// The `bad_color` argument is the color to use in the generated images for -// non-finite input values. It is a `uint8` 1-D tensor of length `channels`. -// Each element must be in the range `[0, 255]` (It represents the value of a -// pixel in the output image). Non-finite values in the input tensor are -// replaced by this tensor in the output image. The default value is the color -// red. -// -// Arguments: -// tag: Scalar. Used to build the `tag` attribute of the summary values. -// tensor: 4-D of shape `[batch_size, height, width, channels]` where -// `channels` is 1, 3, or 4. -// -// Returns Scalar. Serialized `Summary` protocol buffer. -func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ImageSummary", - Input: []tf.Input{ - tag, tensor, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// AudioSummaryV2Attr is an optional argument to AudioSummaryV2. -type AudioSummaryV2Attr func(optionalAttr) - -// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value. -// -// value: Max number of batch elements to generate audio for. -// If not specified, defaults to 3 -// -// REQUIRES: value >= 1 -func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr { - return func(m optionalAttr) { - m["max_outputs"] = value - } -} - -// Outputs a `Summary` protocol buffer with audio. -// -// The summary has up to `max_outputs` summary values containing audio. The -// audio is built from `tensor` which must be 3-D with shape `[batch_size, -// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are -// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`. -// -// The `tag` argument is a scalar `Tensor` of type `string`. It is used to -// build the `tag` of the summary values: -// -// * If `max_outputs` is 1, the summary value tag is '*tag*/audio'. -// * If `max_outputs` is greater than 1, the summary value tags are -// generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc. -// -// Arguments: -// tag: Scalar. Used to build the `tag` attribute of the summary values. -// tensor: 2-D of shape `[batch_size, frames]`. 
-// sample_rate: The sample rate of the signal in hertz. -// -// Returns Scalar. Serialized `Summary` protocol buffer. -func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "AudioSummaryV2", - Input: []tf.Input{ - tag, tensor, sample_rate, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Compute the polygamma function \\(\psi^{(n)}(x)\\). -// -// The polygamma function is defined as: -// -// -// \\(\psi^{(a)}(x) = \frac{d^a}{dx^a} \psi(x)\\) -// -// where \\(\psi(x)\\) is the digamma function. -// The polygamma function is defined only for non-negative integer orders \\a\\. -func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Polygamma", - Input: []tf.Input{ - a, x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// AudioSummaryAttr is an optional argument to AudioSummary. -type AudioSummaryAttr func(optionalAttr) - -// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value. -// -// value: Max number of batch elements to generate audio for. -// If not specified, defaults to 3 -// -// REQUIRES: value >= 1 -func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr { - return func(m optionalAttr) { - m["max_outputs"] = value - } -} - -// Outputs a `Summary` protocol buffer with audio. -// -// DEPRECATED at GraphDef version 15: Use AudioSummaryV2. -// -// The summary has up to `max_outputs` summary values containing audio. The -// audio is built from `tensor` which must be 3-D with shape `[batch_size, -// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are -// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`. -// -// The `tag` argument is a scalar `Tensor` of type `string`. It is used to -// build the `tag` of the summary values: -// -// * If `max_outputs` is 1, the summary value tag is '*tag*/audio'. -// * If `max_outputs` is greater than 1, the summary value tags are -// generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc. -// -// Arguments: -// tag: Scalar. Used to build the `tag` attribute of the summary values. -// tensor: 2-D of shape `[batch_size, frames]`. -// sample_rate: The sample rate of the signal in hertz. -// -// Returns Scalar. Serialized `Summary` protocol buffer. -func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"sample_rate": sample_rate} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "AudioSummary", - Input: []tf.Input{ - tag, tensor, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Encode audio data using the WAV file format. -// -// This operation will generate a string suitable to be saved out to create a .wav -// audio file. It will be encoded in the 16-bit PCM format. It takes in float -// values in the range -1.0f to 1.0f, and any outside that value will be clamped to -// that range. -// -// `audio` is a 2-D float Tensor of shape `[length, channels]`. -// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100). 
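A sketch of the `EncodeWav` wrapper documented here (same assumed imports; 44100 Hz is only an illustrative rate):

```go
func wavBytes(s *op.Scope, samples tf.Output) tf.Output {
	// `samples` is a [length, channels] float32 tensor with values in
	// [-1.0, 1.0]; anything outside that range is clamped by the op.
	return op.EncodeWav(s, samples, op.Const(s, int32(44100)))
}
```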
-// -// Arguments: -// audio: 2-D with shape `[length, channels]`. -// sample_rate: Scalar containing the sample frequency. -// -// Returns 0-D. WAV-encoded file contents. -func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "EncodeWav", - Input: []tf.Input{ - audio, sample_rate, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a Dataset that returns pseudorandom numbers. -// -// Arguments: -// seed: A scalar seed for the random number generator. If either seed or -// seed2 is set to be non-zero, the random number generator is seeded -// by the given seed. Otherwise, a random seed is used. -// seed2: A second scalar seed to avoid seed collision. -// -// -func ExperimentalRandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "ExperimentalRandomDataset", - Input: []tf.Input{ - seed, seed2, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// TextLineReaderV2Attr is an optional argument to TextLineReaderV2. -type TextLineReaderV2Attr func(optionalAttr) - -// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value. -// -// value: Number of lines to skip from the beginning of every file. -// If not specified, defaults to 0 -func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr { - return func(m optionalAttr) { - m["skip_header_lines"] = value - } -} - -// TextLineReaderV2Container sets the optional container attribute to value. -// -// value: If non-empty, this reader is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func TextLineReaderV2Container(value string) TextLineReaderV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// TextLineReaderV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this reader is named in the given bucket -// with this shared_name. Otherwise, the node name is used instead. -// If not specified, defaults to "" -func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// A Reader that outputs the lines of a file delimited by '\n'. -// -// Returns The handle to reference the Reader. -func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TextLineReaderV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates and returns an empty tensor list. -// -// All list elements must be tensors of dtype element_dtype and shape compatible -// with element_shape. -// -// handle: an empty tensor list. -// element_dtype: the type of elements in the list. -// element_shape: a shape compatible with that of elements in the list. 
-func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"element_dtype": element_dtype} - opspec := tf.OpSpec{ - Type: "EmptyTensorList", - Input: []tf.Input{ - element_shape, max_num_elements, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the number of tensors in the input tensor list. -// -// input_handle: the input list -// length: the number of tensors in the list -func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorListLength", - Input: []tf.Input{ - input_handle, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Outputs all keys and values in the table. -// -// Arguments: -// table_handle: Handle to the table. -// -// -// -// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`. -func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues} - opspec := tf.OpSpec{ - Type: "LookupTableExportV2", - Input: []tf.Input{ - table_handle, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// Returns the last element of the input list as well as a list with all but that element. -// -// Fails if the list is empty. -// -// input_handle: the input list -// tensor: the withdrawn last element of the list -// element_dtype: the type of elements in the list -// element_shape: the shape of the output tensor -func TensorListPopBack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"element_dtype": element_dtype} - opspec := tf.OpSpec{ - Type: "TensorListPopBack", - Input: []tf.Input{ - input_handle, element_shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// TensorListConcatAttr is an optional argument to TensorListConcat. -type TensorListConcatAttr func(optionalAttr) - -// TensorListConcatElementShape sets the optional element_shape attribute to value. -// If not specified, defaults to <unknown_rank:true > -func TensorListConcatElementShape(value tf.Shape) TensorListConcatAttr { - return func(m optionalAttr) { - m["element_shape"] = value - } -} - -// Concats all tensors in the list along the 0th dimension. -// -// Requires that all tensors have the same shape except the first dimension. -// -// input_handle: The input list. -// tensor: The concated result. -// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient. 
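The tensor-list wrappers above compose into the usual push/length/pop lifecycle. A hedged sketch (same assumed imports; `TensorListPushBack` is generated elsewhere in this file, and `-1` for `max_num_elements` is taken here to mean "unbounded"):

```go
func listDemo(s *op.Scope) (length, last tf.Output) {
	elemShape := op.Const(s, []int32{2}) // each element is a length-2 vector
	list := op.EmptyTensorList(s, elemShape, op.Const(s, int32(-1)), tf.Float)
	list = op.TensorListPushBack(s, list, op.Const(s, []float32{1, 2}))
	length = op.TensorListLength(s, list)
	_, last = op.TensorListPopBack(s, list, elemShape, tf.Float)
	return length, last
}
```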
-//
-func TensorListConcat(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListConcatAttr) (tensor tf.Output, lengths tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListConcat",
-		Input: []tf.Input{
-			input_handle,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// SpaceToBatch for 4-D tensors of type T.
-//
-// This is a legacy version of the more general SpaceToBatchND.
-//
-// Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
-// More specifically, this op outputs a copy of the input tensor where values from
-// the `height` and `width` dimensions are moved to the `batch` dimension. After
-// the zero-padding, both `height` and `width` of the input must be divisible by the
-// block size.
-//
-// Arguments:
-// input: 4-D with shape `[batch, height, width, depth]`.
-// paddings: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-//   the padding of the input with zeros across the spatial dimensions as follows:
-//
-//       paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]
-//
-//   The effective spatial dimensions of the zero-padded input tensor will be:
-//
-//       height_pad = pad_top + height + pad_bottom
-//       width_pad = pad_left + width + pad_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-//   * Non-overlapping blocks of size `block_size x block_size` in the height and
-//     width dimensions are rearranged into the batch dimension at each location.
-//   * The batch of the output tensor is `batch * block_size * block_size`.
-//   * Both height_pad and width_pad must be divisible by block_size.
-// -// The shape of the output will be: -// -// [batch*block_size*block_size, height_pad/block_size, width_pad/block_size, -// depth] -// -// Some examples: -// -// (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2: -// -// ``` -// x = [[[[1], [2]], [[3], [4]]]] -// ``` -// -// The output tensor has shape `[4, 1, 1, 1]` and value: -// -// ``` -// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]] -// ``` -// -// (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2: -// -// ``` -// x = [[[[1, 2, 3], [4, 5, 6]], -// [[7, 8, 9], [10, 11, 12]]]] -// ``` -// -// The output tensor has shape `[4, 1, 1, 3]` and value: -// -// ``` -// [[[[1, 2, 3]]], [[[4, 5, 6]]], [[[7, 8, 9]]], [[[10, 11, 12]]]] -// ``` -// -// (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2: -// -// ``` -// x = [[[[1], [2], [3], [4]], -// [[5], [6], [7], [8]], -// [[9], [10], [11], [12]], -// [[13], [14], [15], [16]]]] -// ``` -// -// The output tensor has shape `[4, 2, 2, 1]` and value: -// -// ``` -// x = [[[[1], [3]], [[9], [11]]], -// [[[2], [4]], [[10], [12]]], -// [[[5], [7]], [[13], [15]]], -// [[[6], [8]], [[14], [16]]]] -// ``` -// -// (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2: -// -// ``` -// x = [[[[1], [2], [3], [4]], -// [[5], [6], [7], [8]]], -// [[[9], [10], [11], [12]], -// [[13], [14], [15], [16]]]] -// ``` -// -// The output tensor has shape `[8, 1, 2, 1]` and value: -// -// ``` -// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]], -// [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]] -// ``` -// -// Among others, this operation is useful for reducing atrous convolution into -// regular convolution. -// -func SpaceToBatch(scope *Scope, input tf.Output, paddings tf.Output, block_size int64) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"block_size": block_size} - opspec := tf.OpSpec{ - Type: "SpaceToBatch", - Input: []tf.Input{ - input, paddings, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns element-wise smallest integer not less than x. -func Ceil(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Ceil", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// The shape of the elements of the given list, as a tensor. -// -// input_handle: the list -// element_shape: the shape of elements of the list -func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"shape_type": shape_type} - opspec := tf.OpSpec{ - Type: "TensorListElementShape", - Input: []tf.Input{ - input_handle, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Generates values in an interval. -// -// A sequence of `num` evenly-spaced values are generated beginning at `start`. -// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`, -// so that the last one is exactly `stop`. -// -// For example: -// -// ``` -// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0 11.0 12.0] -// ``` -// -// Arguments: -// start: 0-D tensor. First entry in the range. -// stop: 0-D tensor. Last entry in the range. -// num: 0-D tensor. Number of values to generate. -// -// Returns 1-D. The generated values. 
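The step between consecutive `LinSpace` values is `(stop - start) / (num - 1)`, which is what the comment above abbreviates. A small sketch reproducing the documented example with the wrapper defined just below (same assumed imports):

```go
func evenlySpaced(s *op.Scope) tf.Output {
	// Produces [10.0 11.0 12.0], matching the tf.linspace example above.
	return op.LinSpace(s,
		op.Const(s, float32(10.0)), // start
		op.Const(s, float32(12.0)), // stop
		op.Const(s, int32(3)),      // num
	)
}
```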
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "LinSpace", - Input: []tf.Input{ - start, stop, num, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// List of the given size with empty elements. -// -// element_shape: the shape of the future elements of the list -// num_elements: the number of elements to reserve -// handle: the output list -// element_dtype: the desired type of elements in the list. -func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"element_dtype": element_dtype} - opspec := tf.OpSpec{ - Type: "TensorListReserve", - Input: []tf.Input{ - element_shape, num_elements, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2. -type ResourceSparseApplyFtrlV2Attr func(optionalAttr) - -// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var and accum tensors will be protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update relevant entries in '*var' according to the Ftrl-proximal scheme. -// -// That is for rows we have grad for, we update var, accum and linear as follows: -// grad_with_shrinkage = grad + 2 * l2_shrinkage * var -// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage -// linear += grad_with_shrinkage + -// (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -// accum = accum_new -// -// Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// linear: Should be from a Variable(). -// grad: The gradient. -// indices: A vector of indices into the first dimension of var and accum. -// lr: Scaling factor. Must be a scalar. -// l1: L1 regularization. Must be a scalar. -// l2: L2 shrinkage regulariation. Must be a scalar. -// -// lr_power: Scaling factor. Must be a scalar. -// -// Returns the created operation. -func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceSparseApplyFtrlV2", - Input: []tf.Input{ - var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// ExperimentalStatsAggregatorHandleAttr is an optional argument to ExperimentalStatsAggregatorHandle. -type ExperimentalStatsAggregatorHandleAttr func(optionalAttr) - -// ExperimentalStatsAggregatorHandleContainer sets the optional container attribute to value. 
-// If not specified, defaults to "" -func ExperimentalStatsAggregatorHandleContainer(value string) ExperimentalStatsAggregatorHandleAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// ExperimentalStatsAggregatorHandleSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func ExperimentalStatsAggregatorHandleSharedName(value string) ExperimentalStatsAggregatorHandleAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Creates a statistics manager resource. -func ExperimentalStatsAggregatorHandle(scope *Scope, optional ...ExperimentalStatsAggregatorHandleAttr) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ExperimentalStatsAggregatorHandle", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the item in the list with the given index. -// -// input_handle: the list -// index: the position in the list from which an element will be retrieved -// item: the element at that position -// -// -func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_shape tf.Output, element_dtype tf.DataType) (item tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"element_dtype": element_dtype} - opspec := tf.OpSpec{ - Type: "TensorListGetItem", - Input: []tf.Input{ - input_handle, index, element_shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Resizes the list. -// -// -// input_handle: the input list -// size: size of the output list -// -func TensorListResize(scope *Scope, input_handle tf.Output, size tf.Output) (output_handle tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorListResize", - Input: []tf.Input{ - input_handle, size, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Sets the index-th position of the list to contain the given tensor. -// -// input_handle: the list -// index: the position in the list to which the tensor will be assigned -// item: the element to be assigned to that position -// output_handle: the new list, with the element in the proper position -// -func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorListSetItem", - Input: []tf.Input{ - input_handle, index, item, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2. -type SelfAdjointEigV2Attr func(optionalAttr) - -// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value. -// -// value: If `True` then eigenvectors will be computed and returned in `v`. -// Otherwise, only the eigenvalues will be computed. -// If not specified, defaults to true -func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr { - return func(m optionalAttr) { - m["compute_v"] = value - } -} - -// Computes the eigen decomposition of one or more square self-adjoint matrices. -// -// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in -// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues -// are sorted in non-decreasing order. -// -// ```python -// # a is a tensor. -// # e is a tensor of eigenvalues. 
-// # v is a tensor of eigenvectors. -// e, v = self_adjoint_eig(a) -// e = self_adjoint_eig(a, compute_v=False) -// ``` -// -// Arguments: -// input: `Tensor` input of shape `[N, N]`. -// -// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`. -func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "SelfAdjointEigV2", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// Creates a Tensor by indexing into the TensorList. -// -// Each row in the produced Tensor corresponds to the element in the TensorList -// specified by the given index (see `tf.gather`). -// -// input_handle: The input tensor list. -// indices: The indices used to index into the list. -// values: The tensor. -func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_shape tf.Output, element_dtype tf.DataType) (values tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"element_dtype": element_dtype} - opspec := tf.OpSpec{ - Type: "TensorListGather", - Input: []tf.Input{ - input_handle, indices, element_shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the Gauss error function of `x` element-wise. -func Erf(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Erf", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ProdAttr is an optional argument to Prod. -type ProdAttr func(optionalAttr) - -// ProdKeepDims sets the optional keep_dims attribute to value. -// -// value: If true, retain reduced dimensions with length 1. -// If not specified, defaults to false -func ProdKeepDims(value bool) ProdAttr { - return func(m optionalAttr) { - m["keep_dims"] = value - } -} - -// Computes the product of elements across dimensions of a tensor. -// -// Reduces `input` along the dimensions given in `axis`. Unless -// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -// `axis`. If `keep_dims` is true, the reduced dimensions are -// retained with length 1. -// -// Arguments: -// input: The tensor to reduce. -// axis: The dimensions to reduce. Must be in the range -// `[-rank(input), rank(input))`. -// -// Returns The reduced tensor. -func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Prod", - Input: []tf.Input{ - input, axis, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a TensorList by indexing into a Tensor. -// -// Each member of the TensorList corresponds to one row of the input tensor, -// specified by the given index (see `tf.gather`). -// -// tensor: The input tensor. -// indices: The indices used to index into the list. -// element_shape: The shape of the elements in the list (can be less specified than -// the shape of the tensor). -// output_handle: The TensorList. 
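A short sketch of the `SelfAdjointEigV2` wrapper above (same assumed imports):

```go
func eig(s *op.Scope, a tf.Output) (e, v tf.Output) {
	// Eigenvalues come back in non-decreasing order; pass
	// op.SelfAdjointEigV2ComputeV(false) when eigenvectors are not needed.
	return op.SelfAdjointEigV2(s, a)
}
```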
-func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorListScatter", - Input: []tf.Input{ - tensor, indices, element_shape, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2. -type MutableDenseHashTableV2Attr func(optionalAttr) - -// MutableDenseHashTableV2Container sets the optional container attribute to value. -// -// value: If non-empty, this table is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this table is shared under the given name across -// multiple sessions. -// If not specified, defaults to "" -func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value. -// If not specified, defaults to false -func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr { - return func(m optionalAttr) { - m["use_node_name_sharing"] = value - } -} - -// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value. -// -// value: The shape of each value. -// If not specified, defaults to <> -func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr { - return func(m optionalAttr) { - m["value_shape"] = value - } -} - -// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value. -// -// value: The initial number of hash table buckets. Must be a power -// to 2. -// If not specified, defaults to 131072 -func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr { - return func(m optionalAttr) { - m["initial_num_buckets"] = value - } -} - -// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value. -// -// value: The maximum ratio between number of entries and number of -// buckets before growing the table. Must be between 0 and 1. -// If not specified, defaults to 0.8 -func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr { - return func(m optionalAttr) { - m["max_load_factor"] = value - } -} - -// Creates an empty hash table that uses tensors as the backing store. -// -// It uses "open addressing" with quadratic reprobing to resolve -// collisions. -// -// This op creates a mutable hash table, specifying the type of its keys and -// values. Each value must be a scalar. Data can be inserted into the table using -// the insert operations. It does not support the initialization operation. -// -// Arguments: -// empty_key: The key used to represent empty key buckets internally. Must not -// be used in insert or lookup operations. -// -// value_dtype: Type of the table values. -// -// Returns Handle to a table. 
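A hedged sketch of creating the dense table described above with the wrapper defined just below (same assumed imports; the `-1`/`-2` sentinels and the bucket count are illustrative choices, the only requirement being that the sentinels are never used as real keys):

```go
func denseTable(s *op.Scope) tf.Output {
	return op.MutableDenseHashTableV2(s,
		op.Const(s, int64(-1)), // empty_key sentinel
		op.Const(s, int64(-2)), // deleted_key sentinel
		tf.Float,
		op.MutableDenseHashTableV2InitialNumBuckets(1<<17), // must be a power of 2
	)
}
```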
-func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"value_dtype": value_dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MutableDenseHashTableV2", - Input: []tf.Input{ - empty_key, deleted_key, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Scatters tensor at indices in an input list. -// -// Each member of the TensorList corresponds to one row of the input tensor, -// specified by the given index (see `tf.gather`). -// -// input_handle: The list to scatter into. -// tensor: The input tensor. -// indices: The indices used to index into the list. -// output_handle: The TensorList. -func TensorListScatterIntoExistingList(scope *Scope, input_handle tf.Output, tensor tf.Output, indices tf.Output) (output_handle tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "TensorListScatterIntoExistingList", - Input: []tf.Input{ - input_handle, tensor, indices, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Store the input tensor in the state of the current session. -// -// Arguments: -// value: The tensor to be stored. -// -// Returns The handle for the tensor stored in the session state, represented -// as a ResourceHandle object. -func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "GetSessionHandleV2", - Input: []tf.Input{ - value, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the sign and the log of the absolute value of the determinant of -// -// one or more square matrices. -// -// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions -// form square matrices. The outputs are two tensors containing the signs and -// absolute values of the log determinants for all N input submatrices -// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant). -// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU -// is the LU decomposition of the input and P is the corresponding -// permutation matrix. -// -// Arguments: -// input: Shape is `[N, M, M]`. -// -// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants -// of the N input matrices. Shape is `[N]`. -func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "LogMatrixDeterminant", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// MatrixInverseAttr is an optional argument to MatrixInverse. -type MatrixInverseAttr func(optionalAttr) - -// MatrixInverseAdjoint sets the optional adjoint attribute to value. -// If not specified, defaults to false -func MatrixInverseAdjoint(value bool) MatrixInverseAttr { - return func(m optionalAttr) { - m["adjoint"] = value - } -} - -// Computes the inverse of one or more square invertible matrices or their -// -// adjoints (conjugate transposes). -// -// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions -// form square matrices. 
The output is a tensor of the same shape as the input -// containing the inverse for all input submatrices `[..., :, :]`. -// -// The op uses LU decomposition with partial pivoting to compute the inverses. -// -// If a matrix is not invertible there is no guarantee what the op does. It -// may detect the condition and raise an exception or it may simply return a -// garbage result. -// -// Arguments: -// input: Shape is `[..., M, M]`. -// -// Returns Shape is `[..., M, M]`. -// -// @compatibility(numpy) -// Equivalent to np.linalg.inv -// @end_compatibility -func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MatrixInverse", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MatrixSolveAttr is an optional argument to MatrixSolve. -type MatrixSolveAttr func(optionalAttr) - -// MatrixSolveAdjoint sets the optional adjoint attribute to value. -// -// value: Boolean indicating whether to solve with `matrix` or its (block-wise) -// adjoint. -// If not specified, defaults to false -func MatrixSolveAdjoint(value bool) MatrixSolveAttr { - return func(m optionalAttr) { - m["adjoint"] = value - } -} - -// Solves systems of linear equations. -// -// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions -// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is -// a tensor shape `[..., M, K]`. If `adjoint` is `False` then each output matrix -// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`. -// If `adjoint` is `True` then each output matrix satisfies -// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`. -// -// Arguments: -// matrix: Shape is `[..., M, M]`. -// rhs: Shape is `[..., M, K]`. -// -// Returns Shape is `[..., M, K]`. -func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MatrixSolve", - Input: []tf.Input{ - matrix, rhs, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MatrixSolveLsAttr is an optional argument to MatrixSolveLs. -type MatrixSolveLsAttr func(optionalAttr) - -// MatrixSolveLsFast sets the optional fast attribute to value. -// If not specified, defaults to true -func MatrixSolveLsFast(value bool) MatrixSolveLsAttr { - return func(m optionalAttr) { - m["fast"] = value - } -} - -// Solves one or more linear least-squares problems. -// -// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions -// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same -// type as `matrix` and shape `[..., M, K]`. -// The output is a tensor shape `[..., N, K]` where each output matrix solves -// each of the equations -// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]` -// in the least squares sense. -// -// We use the following notation for (complex) matrix and right-hand sides -// in the batch: -// -// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\), -// `rhs`=\\(B \in \mathbb{C}^{m \times k}\\), -// `output`=\\(X \in \mathbb{C}^{n \times k}\\), -// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\). 
-// -// If `fast` is `True`, then the solution is computed by solving the normal -// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then -// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares -// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\). -// If \\(m \lt n\\) then `output` is computed as -// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the -// minimum-norm solution to the under-determined linear system, i.e. -// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\), -// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable -// when \\(A\\) is numerically full rank and has a condition number -// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is -// sufficiently large. -// -// If `fast` is `False` an algorithm based on the numerically robust complete -// orthogonal decomposition is used. This computes the minimum-norm -// least-squares solution, even when \\(A\\) is rank deficient. This path is -// typically 6-7 times slower than the fast path. If `fast` is `False` then -// `l2_regularizer` is ignored. -// -// Arguments: -// matrix: Shape is `[..., M, N]`. -// rhs: Shape is `[..., M, K]`. -// l2_regularizer: Scalar tensor. -// -// @compatibility(numpy) -// Equivalent to np.linalg.lstsq -// @end_compatibility -// -// Returns Shape is `[..., N, K]`. -func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MatrixSolveLs", - Input: []tf.Input{ - matrix, rhs, l2_regularizer, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the matrix square root of one or more square matrices: -// -// matmul(sqrtm(A), sqrtm(A)) = A -// -// The input matrix should be invertible. If the input matrix is real, it should -// have no eigenvalues which are real and negative (pairs of complex conjugate -// eigenvalues are allowed). -// -// The matrix square root is computed by first reducing the matrix to -// quasi-triangular form with the real Schur decomposition. The square root -// of the quasi-triangular matrix is then computed directly. Details of -// the algorithm can be found in: Nicholas J. Higham, "Computing real -// square roots of a real matrix", Linear Algebra Appl., 1987. -// -// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions -// form square matrices. The output is a tensor of the same shape as the input -// containing the matrix square root for all input submatrices `[..., :, :]`. -// -// Arguments: -// input: Shape is `[..., M, M]`. -// -// Returns Shape is `[..., M, M]`. -// -// @compatibility(scipy) -// Equivalent to scipy.linalg.sqrtm -// @end_compatibility -func MatrixSquareRoot(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "MatrixSquareRoot", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// CumsumAttr is an optional argument to Cumsum. -type CumsumAttr func(optionalAttr) - -// CumsumExclusive sets the optional exclusive attribute to value. -// -// value: If `True`, perform exclusive cumsum. 
-// If not specified, defaults to false -func CumsumExclusive(value bool) CumsumAttr { - return func(m optionalAttr) { - m["exclusive"] = value - } -} - -// CumsumReverse sets the optional reverse attribute to value. -// -// value: A `bool` (default: False). -// If not specified, defaults to false -func CumsumReverse(value bool) CumsumAttr { - return func(m optionalAttr) { - m["reverse"] = value - } -} - -// Compute the cumulative sum of the tensor `x` along `axis`. -// -// By default, this op performs an inclusive cumsum, which means that the first -// element of the input is identical to the first element of the output: -// -// ```python -// tf.cumsum([a, b, c]) # => [a, a + b, a + b + c] -// ``` -// -// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is -// performed instead: -// -// ```python -// tf.cumsum([a, b, c], exclusive=True) # => [0, a, a + b] -// ``` -// -// By setting the `reverse` kwarg to `True`, the cumsum is performed in the -// opposite direction: -// -// ```python -// tf.cumsum([a, b, c], reverse=True) # => [a + b + c, b + c, c] -// ``` -// -// This is more efficient than using separate `tf.reverse` ops. -// -// The `reverse` and `exclusive` kwargs can also be combined: -// -// ```python -// tf.cumsum([a, b, c], exclusive=True, reverse=True) # => [b + c, c, 0] -// ``` -// -// Arguments: -// x: A `Tensor`. Must be one of the following types: `float32`, `float64`, -// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, -// `complex128`, `qint8`, `quint8`, `qint32`, `half`. -// axis: A `Tensor` of type `int32` (default: 0). Must be in the range -// `[-rank(x), rank(x))`. -func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Cumsum", - Input: []tf.Input{ - x, axis, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Enqueue a Tensor on the computation outfeed. -// -// Arguments: -// input: A tensor that will be inserted into the outfeed queue. -// -// Returns the created operation. -func OutfeedEnqueue(scope *Scope, input tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "OutfeedEnqueue", - Input: []tf.Input{ - input, - }, - } - return scope.AddOperation(opspec) -} - -// Computes the gradient of the sigmoid of `x` wrt its input. -// -// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and -// `dy` is the corresponding input gradient. -func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SigmoidGrad", - Input: []tf.Input{ - y, dy, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// QrAttr is an optional argument to Qr. -type QrAttr func(optionalAttr) - -// QrFullMatrices sets the optional full_matrices attribute to value. -// -// value: If true, compute full-sized `q` and `r`. If false -// (the default), compute only the leading `P` columns of `q`. -// If not specified, defaults to false -func QrFullMatrices(value bool) QrAttr { - return func(m optionalAttr) { - m["full_matrices"] = value - } -} - -// Computes the QR decompositions of one or more matrices. -// -// Computes the QR decomposition of each inner matrix in `tensor` such that -// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])` -// -// ```python -// # a is a tensor. 
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
-//
-// Arguments:
-// input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
-//
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Qr",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// SvdAttr is an optional argument to Svd.
-type SvdAttr func(optionalAttr)
-
-// SvdComputeUv sets the optional compute_uv attribute to value.
-//
-// value: If true, left and right singular vectors will be
-// computed and returned in `u` and `v`, respectively.
-// If false, `u` and `v` are not set and should never be referenced.
-// If not specified, defaults to true
-func SvdComputeUv(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["compute_uv"] = value
-	}
-}
-
-// SvdFullMatrices sets the optional full_matrices attribute to value.
-//
-// value: If true, compute full-sized `u` and `v`. If false
-// (the default), compute only the leading `P` singular vectors.
-// Ignored if `compute_uv` is `False`.
-// If not specified, defaults to false
-func SvdFullMatrices(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
-	}
-}
-
-// Computes the singular value decompositions of one or more matrices.
-//
-// Computes the SVD of each inner matrix in `input` such that
-// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
-//
-// ```python
-// # a is a tensor containing a batch of matrices.
-// # s is a tensor of singular values for each matrix.
-// # u is the tensor containing the left singular vectors for each matrix.
-// # v is the tensor containing the right singular vectors for each matrix.
-// s, u, v = svd(a)
-// s, _, _ = svd(a, compute_uv=False)
-// ```
-//
-// Arguments:
-// input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
-//
-// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`. Undefined if `compute_uv` is `False`.Right singular vectors. If `full_matrices` is `False` then shape is
-// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-// Undefined if `compute_uv` is false.
-func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Svd",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns a serialized GraphDef representing `input_dataset`.
-// -// Returns a graph representation for `input_dataset`. -// -// Arguments: -// input_dataset: A variant tensor representing the dataset to return the graph representation for. -// -// Returns The graph representation of the dataset (as serialized GraphDef). -func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "DatasetToGraph", - Input: []tf.Input{ - input_dataset, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RetrieveTPUEmbeddingRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParameters. -type RetrieveTPUEmbeddingRMSPropParametersAttr func(optionalAttr) - -// RetrieveTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingRMSPropParametersTableName(value string) RetrieveTPUEmbeddingRMSPropParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve RMSProp embedding parameters. -// -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. -// -// Returns Parameter parameters updated by the RMSProp optimization algorithm.Parameter ms updated by the RMSProp optimization algorithm.Parameter mom updated by the RMSProp optimization algorithm. -func RetrieveTPUEmbeddingRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingRMSPropParameters", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler. -type AllCandidateSamplerAttr func(optionalAttr) - -// AllCandidateSamplerSeed sets the optional seed attribute to value. -// -// value: If either seed or seed2 are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value. -// -// value: An second seed to avoid seed collision. -// If not specified, defaults to 0 -func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Generates labels for candidate sampling with a learned unigram distribution. -// -// See explanations of candidate sampling and the data formats at -// go/candidate-sampling. -// -// For each batch, this op picks a single set of sampled candidate labels. 
-// -// The advantages of sampling candidates per-batch are simplicity and the -// possibility of efficient dense matrix multiplication. The disadvantage is that -// the sampled candidates must be chosen independently of the context and of the -// true labels. -// -// Arguments: -// true_classes: A batch_size * num_true matrix, in which each row contains the -// IDs of the num_true target_classes in the corresponding original label. -// num_true: Number of true labels per context. -// num_sampled: Number of candidates to produce. -// unique: If unique is true, we sample with rejection, so that all sampled -// candidates in a batch are unique. This requires some approximation to -// estimate the post-rejection sampling probabilities. -// -// Returns A vector of length num_sampled, in which each element is -// the ID of a sampled candidate.A batch_size * num_true matrix, representing -// the number of times each candidate is expected to occur in a batch -// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled -// candidate representing the number of times the candidate is expected -// to occur in a batch of sampled candidates. If unique=true, then this is a -// probability. -func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "AllCandidateSampler", - Input: []tf.Input{ - true_classes, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// Outputs a `Summary` protocol buffer with scalar values. -// -// The input `tags` and `values` must have the same shape. The generated summary -// has a summary value for each tag-value pair in `tags` and `values`. -// -// Arguments: -// tags: Tags for the summary. -// values: Same shape as `tags. Values for the summary. -// -// Returns Scalar. Serialized `Summary` protocol buffer. -func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ScalarSummary", - Input: []tf.Input{ - tags, values, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the truth value of x AND y element-wise. -// -// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "LogicalAnd", - Input: []tf.Input{ - x, y, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RetrieveTPUEmbeddingProximalAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingProximalAdagradParameters. -type RetrieveTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr) - -// RetrieveTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value. 
-// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingProximalAdagradParametersTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingProximalAdagradParametersTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve proximal Adagrad embedding parameters. -// -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. -// -// Returns Parameter parameters updated by the proximal Adagrad optimization algorithm.Parameter accumulators updated by the proximal Adagrad optimization algorithm. -func RetrieveTPUEmbeddingProximalAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingProximalAdagradParameters", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// Saves tensors in V2 checkpoint format. -// -// By default, saves the named tensors in full. If the caller wishes to save -// specific slices of full tensors, "shape_and_slices" should be non-empty strings -// and correspondingly well-formed. -// -// Arguments: -// prefix: Must have a single element. The prefix of the V2 checkpoint to which we -// write the tensors. -// tensor_names: shape {N}. The names of the tensors to be saved. -// shape_and_slices: shape {N}. The slice specs of the tensors to be saved. -// Empty strings indicate that they are non-partitioned tensors. -// tensors: `N` tensors to save. -// -// Returns the created operation. -func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SaveV2", - Input: []tf.Input{ - prefix, tensor_names, shape_and_slices, tf.OutputList(tensors), - }, - } - return scope.AddOperation(opspec) -} - -// Restores tensors from a V2 checkpoint. -// -// For backward compatibility with the V1 format, this Op currently allows -// restoring from a V1 checkpoint as well: -// - This Op first attempts to find the V2 index file pointed to by "prefix", and -// if found proceed to read it as a V2 checkpoint; -// - Otherwise the V1 read path is invoked. -// Relying on this behavior is not recommended, as the ability to fall back to read -// V1 might be deprecated and eventually removed. -// -// By default, restores the named tensors in full. If the caller wishes to restore -// specific slices of stored tensors, "shape_and_slices" should be non-empty -// strings and correspondingly well-formed. -// -// Callers must ensure all the named tensors are indeed stored in the checkpoint. -// -// Arguments: -// prefix: Must have a single element. The prefix of a V2 checkpoint. 
-// tensor_names: shape {N}. The names of the tensors to be restored. -// shape_and_slices: shape {N}. The slice specs of the tensors to be restored. -// Empty strings indicate that they are non-partitioned tensors. -// dtypes: shape {N}. The list of expected dtype for the tensors. Must match -// those stored in the checkpoint. -// -// Returns shape {N}. The restored tensors, whose shapes are read from the -// checkpoint directly. -func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtypes": dtypes} - opspec := tf.OpSpec{ - Type: "RestoreV2", - Input: []tf.Input{ - prefix, tensor_names, shape_and_slices, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil { - scope.UpdateErr("RestoreV2", err) - return - } - return tensors -} - -// Saves the input tensors to disk. -// -// The size of `tensor_names` must match the number of tensors in `data`. `data[i]` -// is written to `filename` with name `tensor_names[i]`. -// -// See also `SaveSlices`. -// -// Arguments: -// filename: Must have a single element. The name of the file to which we write -// the tensor. -// tensor_names: Shape `[N]`. The names of the tensors to be saved. -// data: `N` tensors to save. -// -// Returns the created operation. -func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Save", - Input: []tf.Input{ - filename, tensor_names, tf.OutputList(data), - }, - } - return scope.AddOperation(opspec) -} - -// QueueCloseV2Attr is an optional argument to QueueCloseV2. -type QueueCloseV2Attr func(optionalAttr) - -// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value. -// -// value: If true, all pending enqueue requests that are -// blocked on the given queue will be canceled. -// If not specified, defaults to false -func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr { - return func(m optionalAttr) { - m["cancel_pending_enqueues"] = value - } -} - -// Closes the given queue. -// -// This operation signals that no more elements will be enqueued in the -// given queue. Subsequent Enqueue(Many) operations will fail. -// Subsequent Dequeue(Many) operations will continue to succeed if -// sufficient elements remain in the queue. Subsequent Dequeue(Many) -// operations that would block will fail immediately. -// -// Arguments: -// handle: The handle to a queue. -// -// Returns the created operation. -func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QueueCloseV2", - Input: []tf.Input{ - handle, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - // Deprecated. Disallowed in GraphDef version >= 2. // // DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead @@ -36759,1489 +38753,197 @@ func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, m return op.Output(0) } -// Saves input tensors slices to disk. +// Gather slices from `params` into a Tensor with shape specified by `indices`. 
// -// This is like `Save` except that tensors can be listed in the saved file as being -// a slice of a larger tensor. `shapes_and_slices` specifies the shape of the -// larger tensor and the slice that this tensor covers. `shapes_and_slices` must -// have as many elements as `tensor_names`. +// `indices` is an K-dimensional integer tensor, best thought of as a +// (K-1)-dimensional tensor of indices into `params`, where each element defines a +// slice of `params`: // -// Elements of the `shapes_and_slices` input must either be: +// output[\\(i_0, ..., i_{K-2}\\)] = params[indices[\\(i_0, ..., i_{K-2}\\)]] // -// * The empty string, in which case the corresponding tensor is -// saved normally. -// * A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the -// `dimI` are the dimensions of the larger tensor and `slice-spec` -// specifies what part is covered by the tensor to save. +// Whereas in `tf.gather` `indices` defines slices into the first +// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the +// first `N` dimensions of `params`, where `N = indices.shape[-1]`. // -// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1` -// where each `sliceI` is either: +// The last dimension of `indices` can be at most the rank of +// `params`: // -// * The string `-` meaning that the slice covers all indices of this dimension -// * `start,length` where `start` and `length` are integers. In that -// case the slice covers `length` indices starting at `start`. +// indices.shape[-1] <= params.rank // -// See also `Save`. +// The last dimension of `indices` corresponds to elements +// (if `indices.shape[-1] == params.rank`) or slices +// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]` +// of `params`. The output tensor has shape // -// Arguments: -// filename: Must have a single element. The name of the file to which we write the -// tensor. -// tensor_names: Shape `[N]`. The names of the tensors to be saved. -// shapes_and_slices: Shape `[N]`. The shapes and slice specifications to use when -// saving the tensors. -// data: `N` tensors to save. +// indices.shape[:-1] + params.shape[indices.shape[-1]:] // -// Returns the created operation. -func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SaveSlices", - Input: []tf.Input{ - filename, tensor_names, shapes_and_slices, tf.OutputList(data), - }, - } - return scope.AddOperation(opspec) -} - -// RestoreSliceAttr is an optional argument to RestoreSlice. -type RestoreSliceAttr func(optionalAttr) - -// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value. +// Note that on CPU, if an out of bound index is found, an error is returned. +// On GPU, if an out of bound index is found, a 0 is stored in the +// corresponding output value. // -// value: Index of file to open first if multiple files match -// `file_pattern`. See the documentation for `Restore`. -// If not specified, defaults to -1 -func RestoreSlicePreferredShard(value int64) RestoreSliceAttr { - return func(m optionalAttr) { - m["preferred_shard"] = value - } -} - -// Restores a tensor from checkpoint files. +// Some examples below. // -// This is like `Restore` except that restored tensor can be listed as filling -// only a slice of a larger tensor. 
`shape_and_slice` specifies the shape of the -// larger tensor and the slice that the restored tensor covers. -// -// The `shape_and_slice` input has the same format as the -// elements of the `shapes_and_slices` input of the `SaveSlices` op. -// -// Arguments: -// file_pattern: Must have a single element. The pattern of the files from -// which we read the tensor. -// tensor_name: Must have a single element. The name of the tensor to be -// restored. -// shape_and_slice: Scalar. The shapes and slice specifications to use when -// restoring a tensors. -// dt: The type of the tensor to be restored. -// -// Returns The restored tensor. -func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dt": dt} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RestoreSlice", - Input: []tf.Input{ - file_pattern, tensor_name, shape_and_slice, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Extract `patches` from `images` and put them in the "depth" output dimension. -// -// Arguments: -// images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`. -// ksizes: The size of the sliding window for each dimension of `images`. -// strides: 1-D of length 4. How far the centers of two consecutive patches are in -// the images. Must be: `[1, stride_rows, stride_cols, 1]`. -// rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the -// input stride, specifying how far two consecutive patch samples are in the -// input. Equivalent to extracting patches with -// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by -// subsampling them spatially by a factor of `rates`. This is equivalent to -// `rate` in dilated (a.k.a. Atrous) convolutions. -// padding: The type of padding algorithm to use. -// -// We specify the size-related attributes as: +// Simple indexing into a matrix: // // ```python -// ksizes = [1, ksize_rows, ksize_cols, 1] -// strides = [1, strides_rows, strides_cols, 1] -// rates = [1, rates_rows, rates_cols, 1] +// indices = [[0, 0], [1, 1]] +// params = [['a', 'b'], ['c', 'd']] +// output = ['a', 'd'] // ``` // -// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows * -// ksize_cols * depth]` containing image patches with size -// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note -// `out_rows` and `out_cols` are the dimensions of the output patches. -func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding} - opspec := tf.OpSpec{ - Type: "ExtractImagePatches", - Input: []tf.Input{ - images, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Generate a glob pattern matching all sharded file names. -func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ShardedFilespec", - Input: []tf.Input{ - basename, num_shards, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Mutually accumulates multiple tensors of identical type and shape. 
-func CollectiveGather(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape} - opspec := tf.OpSpec{ - Type: "CollectiveGather", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Converts the given variant tensor to an iterator and stores it in the given resource. -// -// Arguments: -// resource_handle: A handle to an iterator resource. -// serialized: A variant tensor storing the state of the iterator contained in the -// resource. -// -// Returns the created operation. -func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "DeserializeIterator", - Input: []tf.Input{ - resource_handle, serialized, - }, - } - return scope.AddOperation(opspec) -} - -// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum. -type ResourceSparseApplyMomentumAttr func(optionalAttr) - -// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var and accum tensors will be protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value. -// -// value: If `True`, the tensor passed to compute grad will be -// var - lr * momentum * accum, so in the end, the var you get is actually -// var - lr * momentum * accum. -// If not specified, defaults to false -func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr { - return func(m optionalAttr) { - m["use_nesterov"] = value - } -} - -// Update relevant entries in '*var' and '*accum' according to the momentum scheme. -// -// Set use_nesterov = True if you want to use Nesterov momentum. -// -// That is for rows we have grad for, we update var and accum as follows: -// -// accum = accum * momentum + grad -// var -= lr * accum -// -// Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// lr: Learning rate. Must be a scalar. -// grad: The gradient. -// indices: A vector of indices into the first dimension of var and accum. -// momentum: Momentum. Must be a scalar. -// -// Returns the created operation. -func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceSparseApplyMomentum", - Input: []tf.Input{ - var_, accum, lr, grad, indices, momentum, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2. -type WholeFileReaderV2Attr func(optionalAttr) - -// WholeFileReaderV2Container sets the optional container attribute to value. 
-// -// value: If non-empty, this reader is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// WholeFileReaderV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this reader is named in the given bucket -// with this shared_name. Otherwise, the node name is used instead. -// If not specified, defaults to "" -func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// A Reader that outputs the entire contents of a file as a value. -// -// To use, enqueue filenames in a Queue. The output of ReaderRead will -// be a filename (key) and the contents of that file (value). -// -// Returns The handle to reference the Reader. -func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "WholeFileReaderV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// MapUnstageAttr is an optional argument to MapUnstage. -type MapUnstageAttr func(optionalAttr) - -// MapUnstageCapacity sets the optional capacity attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func MapUnstageCapacity(value int64) MapUnstageAttr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// MapUnstageMemoryLimit sets the optional memory_limit attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func MapUnstageMemoryLimit(value int64) MapUnstageAttr { - return func(m optionalAttr) { - m["memory_limit"] = value - } -} - -// MapUnstageContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func MapUnstageContainer(value string) MapUnstageAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// MapUnstageSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func MapUnstageSharedName(value string) MapUnstageAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Op removes and returns the values associated with the key -// -// from the underlying container. If the underlying container -// does not contain this key, the op will block until it does. -func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtypes": dtypes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MapUnstage", - Input: []tf.Input{ - key, indices, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if values, idx, err = makeOutputList(op, idx, "values"); err != nil { - scope.UpdateErr("MapUnstage", err) - return - } - return values -} - -// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2. -type TFRecordReaderV2Attr func(optionalAttr) - -// TFRecordReaderV2Container sets the optional container attribute to value. -// -// value: If non-empty, this reader is placed in the given container. -// Otherwise, a default container is used. 
-// If not specified, defaults to "" -func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// TFRecordReaderV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this reader is named in the given bucket -// with this shared_name. Otherwise, the node name is used instead. -// If not specified, defaults to "" -func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value. -// If not specified, defaults to "" -func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr { - return func(m optionalAttr) { - m["compression_type"] = value - } -} - -// A Reader that outputs the records from a TensorFlow Records file. -// -// Returns The handle to reference the Reader. -func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TFRecordReaderV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2. -type PaddingFIFOQueueV2Attr func(optionalAttr) - -// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value. -// -// value: The shape of each component in a value. The length of this attr must -// be either 0 or the same as the length of component_types. -// Shapes of fixed rank but variable size are allowed by setting -// any shape dimension to -1. In this case, the inputs' shape may vary along -// the given dimension, and DequeueMany will pad the given dimension with -// zeros up to the maximum shape of all elements in the given batch. -// If the length of this attr is 0, different queue elements may have -// different ranks and shapes, but only one element may be dequeued at a time. -// If not specified, defaults to <> -// -// REQUIRES: len(value) >= 0 -func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr { - return func(m optionalAttr) { - m["shapes"] = value - } -} - -// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value. -// -// value: The upper bound on the number of elements in this queue. -// Negative numbers mean no limit. -// If not specified, defaults to -1 -func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// PaddingFIFOQueueV2Container sets the optional container attribute to value. -// -// value: If non-empty, this queue is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this queue will be shared under the given name -// across multiple sessions. -// If not specified, defaults to "" -func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// A queue that produces elements in first-in first-out order. 
-// -// Variable-size shapes are allowed by setting the corresponding shape dimensions -// to 0 in the shape attr. In this case DequeueMany will pad up to the maximum -// size of any given element in the minibatch. See below for details. -// -// Arguments: -// component_types: The type of each component in a value. -// -// Returns The handle to the queue. -func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"component_types": component_types} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "PaddingFIFOQueueV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the number of work units this Reader has finished processing. -// -// Arguments: -// reader_handle: Handle to a Reader. -func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ReaderNumWorkUnitsCompletedV2", - Input: []tf.Input{ - reader_handle, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Converts a flat index or array of flat indices into a tuple of -// -// coordinate arrays. -// -// @compatibility(numpy) -// Equivalent to np.unravel_index -// @end_compatibility -// -// Arguments: -// indices: An 0-D or 1-D `int` Tensor whose elements are indices into the -// flattened version of an array of dimensions dims. -// dims: An 1-D `int` Tensor. The shape of the array to use for unraveling -// indices. -// -// Returns An 2-D (or 1-D if indices is 0-D) tensor where each row has the -// same shape as the indices array. -func UnravelIndex(scope *Scope, indices tf.Output, dims tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "UnravelIndex", - Input: []tf.Input{ - indices, dims, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Gather ragged slices from `params` axis `0` according to `indices`. -// -// Outputs a `RaggedTensor` output composed from `output_dense_values` and -// `output_nested_splits`, such that: +// Slice indexing into a matrix: // // ```python -// output.shape = indices.shape + params.shape[1:] -// output.ragged_rank = indices.shape.ndims + params.ragged_rank -// output[i...j, d0...dn] = params[indices[i...j], d0...dn] +// indices = [[1], [0]] +// params = [['a', 'b'], ['c', 'd']] +// output = [['c', 'd'], ['a', 'b']] // ``` // -// where +// Indexing into a 3-tensor: // -// * `params = -// ragged.from_nested_row_splits(params_dense_values, params_nested_splits)` -// provides the values that should be gathered. -// * `indices` ia a dense tensor with dtype `int32` or `int64`, indicating which -// values should be gathered. -// * `output = -// ragged.from_nested_row_splits(output_dense_values, output_nested_splits)` -// is the output tensor. +// ```python +// indices = [[1]] +// params = [[['a0', 'b0'], ['c0', 'd0']], +// [['a1', 'b1'], ['c1', 'd1']]] +// output = [[['a1', 'b1'], ['c1', 'd1']]] // -// (Note: This c++ op is used to implement the higher-level python -// `tf.ragged.gather` op, which also supports ragged indices.) 
// +// indices = [[0, 1], [1, 0]] +// params = [[['a0', 'b0'], ['c0', 'd0']], +// [['a1', 'b1'], ['c1', 'd1']]] +// output = [['c0', 'd0'], ['a1', 'b1']] +// +// +// indices = [[0, 0, 1], [1, 0, 1]] +// params = [[['a0', 'b0'], ['c0', 'd0']], +// [['a1', 'b1'], ['c1', 'd1']]] +// output = ['b0', 'b1'] +// ``` +// +// Batched indexing into a matrix: +// +// ```python +// indices = [[[0, 0]], [[0, 1]]] +// params = [['a', 'b'], ['c', 'd']] +// output = [['a'], ['b']] +// ``` +// +// Batched slice indexing into a matrix: +// +// ```python +// indices = [[[1]], [[0]]] +// params = [['a', 'b'], ['c', 'd']] +// output = [[['c', 'd']], [['a', 'b']]] +// ``` +// +// Batched indexing into a 3-tensor: +// +// ```python +// indices = [[[1]], [[0]]] +// params = [[['a0', 'b0'], ['c0', 'd0']], +// [['a1', 'b1'], ['c1', 'd1']]] +// output = [[[['a1', 'b1'], ['c1', 'd1']]], +// [[['a0', 'b0'], ['c0', 'd0']]]] +// +// indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]] +// params = [[['a0', 'b0'], ['c0', 'd0']], +// [['a1', 'b1'], ['c1', 'd1']]] +// output = [[['c0', 'd0'], ['a1', 'b1']], +// [['a0', 'b0'], ['c1', 'd1']]] +// +// +// indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]] +// params = [[['a0', 'b0'], ['c0', 'd0']], +// [['a1', 'b1'], ['c1', 'd1']]] +// output = [['b0', 'b1'], ['d0', 'c1']] +// ``` +// +// See also `tf.gather` and `tf.batch_gather`. // // Arguments: -// params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the -// `params` RaggedTensor input. -// params_dense_values: The `flat_values` for the `params` RaggedTensor. There was a terminology change -// at the python level from dense_values to flat_values, so dense_values is the -// deprecated name. -// indices: Indices in the outermost dimension of `params` of the values that should be -// gathered. -// OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain -// this number of `row_splits` tensors. This value should equal -// `indices.shape.ndims + params.ragged_rank - 1`. +// params: The tensor from which to gather values. +// indices: Index tensor. // -// Returns The `nested_row_splits` tensors that define the row-partitioning for the -// returned RaggedTensor.The `flat_values` for the returned RaggedTensor. -func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"OUTPUT_RAGGED_RANK": OUTPUT_RAGGED_RANK} - opspec := tf.OpSpec{ - Type: "RaggedGather", - Input: []tf.Input{ - tf.OutputList(params_nested_splits), params_dense_values, indices, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil { - scope.UpdateErr("RaggedGather", err) - return - } - output_dense_values = op.Output(idx) - return output_nested_splits, output_dense_values -} - -// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints. -type MergeV2CheckpointsAttr func(optionalAttr) - -// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value. -// -// value: see above. 
-// If not specified, defaults to true -func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr { - return func(m optionalAttr) { - m["delete_old_dirs"] = value - } -} - -// V2 format specific: merges the metadata files of sharded checkpoints. The -// -// result is one logical checkpoint, with one physical metadata file and renamed -// data files. -// -// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup. -// -// If delete_old_dirs is true, attempts to delete recursively the dirname of each -// path in the input checkpoint_prefixes. This is useful when those paths are non -// user-facing temporary locations. -// -// Arguments: -// checkpoint_prefixes: prefixes of V2 checkpoints to merge. -// destination_prefix: scalar. The desired final prefix. Allowed to be the same -// as one of the checkpoint_prefixes. -// -// Returns the created operation. -func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "MergeV2Checkpoints", - Input: []tf.Input{ - checkpoint_prefixes, destination_prefix, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// Records the latency of producing `input_dataset` elements in a StatsAggregator. -func ExperimentalLatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "ExperimentalLatencyStatsDataset", - Input: []tf.Input{ - input_dataset, tag, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Produce a string tensor that encodes the state of a Reader. -// -// Not all Readers support being serialized, so this can produce an -// Unimplemented error. -// -// Arguments: -// reader_handle: Handle to a Reader. -func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) { +// Returns Values from `params` gathered from indices given by `indices`, with +// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`. +func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "ReaderSerializeStateV2", + Type: "GatherNd", Input: []tf.Input{ - reader_handle, + params, indices, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// Restore a reader to a previously saved state. +// Adjust the saturation of one or more images. // -// Not all Readers support being restored, so this can produce an -// Unimplemented error. -// -// Arguments: -// reader_handle: Handle to a Reader. -// state: Result of a ReaderSerializeState of a Reader with type -// matching reader_handle. -// -// Returns the created operation. -func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ReaderRestoreStateV2", - Input: []tf.Input{ - reader_handle, state, - }, - } - return scope.AddOperation(opspec) -} - -// Writes contents to the file at input filename. Creates file and recursively -// -// creates directory if not existing. -// -// Arguments: -// filename: scalar. 
The name of the file to which we write the contents. -// contents: scalar. The content to be written to the output file. -// -// Returns the created operation. -func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "WriteFile", - Input: []tf.Input{ - filename, contents, - }, - } - return scope.AddOperation(opspec) -} - -// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad. -type ResizeBicubicGradAttr func(optionalAttr) - -// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value. -// -// value: If true, the centers of the 4 corner pixels of the input and grad tensors are -// aligned. Defaults to false. -// If not specified, defaults to false -func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr { - return func(m optionalAttr) { - m["align_corners"] = value - } -} - -// ResizeBicubicGradHalfPixelCenters sets the optional half_pixel_centers attribute to value. -// If not specified, defaults to false -func ResizeBicubicGradHalfPixelCenters(value bool) ResizeBicubicGradAttr { - return func(m optionalAttr) { - m["half_pixel_centers"] = value - } -} - -// Computes the gradient of bicubic interpolation. -// -// Arguments: -// grads: 4-D with shape `[batch, height, width, channels]`. -// original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`, -// The image tensor that was resized. -// -// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. -// Gradients with respect to the input image. Input image must have been -// float or double. -func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResizeBicubicGrad", - Input: []tf.Input{ - grads, original_image, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StringLengthAttr is an optional argument to StringLength. -type StringLengthAttr func(optionalAttr) - -// StringLengthUnit sets the optional unit attribute to value. -// -// value: The unit that is counted to compute string length. One of: `"BYTE"` (for -// the number of bytes in each string) or `"UTF8_CHAR"` (for the number of UTF-8 -// encoded Unicode code points in each string). Results are undefined -// if `unit=UTF8_CHAR` and the `input` strings do not contain structurally -// valid UTF-8. -// If not specified, defaults to "BYTE" -func StringLengthUnit(value string) StringLengthAttr { - return func(m optionalAttr) { - m["unit"] = value - } -} - -// String lengths of `input`. -// -// Computes the length of each string given in the input tensor. -// -// Arguments: -// input: The string for which to compute the length. -// -// Returns Integer tensor that has the same shape as `input`. The output contains the -// element-wise string lengths of `input`. -func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StringLength", - Input: []tf.Input{ - input, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor. 
-type ResizeNearestNeighborAttr func(optionalAttr) - -// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value. -// -// value: If true, the centers of the 4 corner pixels of the input and output tensors are -// aligned, preserving the values at the corner pixels. Defaults to false. -// If not specified, defaults to false -func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr { - return func(m optionalAttr) { - m["align_corners"] = value - } -} - -// ResizeNearestNeighborHalfPixelCenters sets the optional half_pixel_centers attribute to value. -// If not specified, defaults to false -func ResizeNearestNeighborHalfPixelCenters(value bool) ResizeNearestNeighborAttr { - return func(m optionalAttr) { - m["half_pixel_centers"] = value - } -} - -// Resize `images` to `size` using nearest neighbor interpolation. -// -// Arguments: -// images: 4-D with shape `[batch, height, width, channels]`. -// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The -// new size for the images. -// -// Returns 4-D with shape -// `[batch, new_height, new_width, channels]`. -func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResizeNearestNeighbor", - Input: []tf.Input{ - images, size, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// DecodeJpegAttr is an optional argument to DecodeJpeg. -type DecodeJpegAttr func(optionalAttr) - -// DecodeJpegChannels sets the optional channels attribute to value. -// -// value: Number of color channels for the decoded image. -// If not specified, defaults to 0 -func DecodeJpegChannels(value int64) DecodeJpegAttr { - return func(m optionalAttr) { - m["channels"] = value - } -} - -// DecodeJpegRatio sets the optional ratio attribute to value. -// -// value: Downscaling ratio. -// If not specified, defaults to 1 -func DecodeJpegRatio(value int64) DecodeJpegAttr { - return func(m optionalAttr) { - m["ratio"] = value - } -} - -// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value. -// -// value: If true use a slower but nicer upscaling of the -// chroma planes (yuv420/422 only). -// If not specified, defaults to true -func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr { - return func(m optionalAttr) { - m["fancy_upscaling"] = value - } -} - -// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value. -// -// value: If true try to recover an image from truncated input. -// If not specified, defaults to false -func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr { - return func(m optionalAttr) { - m["try_recover_truncated"] = value - } -} - -// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value. -// -// value: The minimum required fraction of lines before a truncated -// input is accepted. -// If not specified, defaults to 1 -func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr { - return func(m optionalAttr) { - m["acceptable_fraction"] = value - } -} - -// DecodeJpegDctMethod sets the optional dct_method attribute to value. -// -// value: string specifying a hint about the algorithm used for -// decompression. Defaults to "" which maps to a system-specific -// default. 
Currently valid values are ["INTEGER_FAST", -// "INTEGER_ACCURATE"]. The hint may be ignored (e.g., the internal -// jpeg library changes to a version that does not have that specific -// option.) -// If not specified, defaults to "" -func DecodeJpegDctMethod(value string) DecodeJpegAttr { - return func(m optionalAttr) { - m["dct_method"] = value - } -} - -// Decode a JPEG-encoded image to a uint8 tensor. -// -// The attr `channels` indicates the desired number of color channels for the -// decoded image. -// -// Accepted values are: -// -// * 0: Use the number of channels in the JPEG-encoded image. -// * 1: output a grayscale image. -// * 3: output an RGB image. -// -// If needed, the JPEG-encoded image is transformed to match the requested number -// of color channels. -// -// The attr `ratio` allows downscaling the image by an integer factor during -// decoding. Allowed values are: 1, 2, 4, and 8. This is much faster than -// downscaling the image later. -// -// -// This op also supports decoding PNGs and non-animated GIFs since the interface is -// the same, though it is cleaner to use `tf.image.decode_image`. -// -// Arguments: -// contents: 0-D. The JPEG-encoded image. -// -// Returns 3-D with shape `[height, width, channels]`.. -func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "DecodeJpeg", - Input: []tf.Input{ - contents, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ExperimentalThreadPoolHandleAttr is an optional argument to ExperimentalThreadPoolHandle. -type ExperimentalThreadPoolHandleAttr func(optionalAttr) - -// ExperimentalThreadPoolHandleMaxIntraOpParallelism sets the optional max_intra_op_parallelism attribute to value. -// -// value: The maximum degree of parallelism to use within operations that execute on this -// threadpool. -// If not specified, defaults to 1 -func ExperimentalThreadPoolHandleMaxIntraOpParallelism(value int64) ExperimentalThreadPoolHandleAttr { - return func(m optionalAttr) { - m["max_intra_op_parallelism"] = value - } -} - -// ExperimentalThreadPoolHandleContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func ExperimentalThreadPoolHandleContainer(value string) ExperimentalThreadPoolHandleAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// ExperimentalThreadPoolHandleSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func ExperimentalThreadPoolHandleSharedName(value string) ExperimentalThreadPoolHandleAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Creates a dataset that uses a custom thread pool to compute `input_dataset`. -// -// Arguments: -// num_threads: The number of threads in the thread pool. -// display_name: A human-readable name for the threads that may be visible in some -// visualizations. -// threadpool. -// -// Returns A resource that can be consumed by one or more ExperimentalThreadPoolDataset -// ops. 
-func ExperimentalThreadPoolHandle(scope *Scope, num_threads int64, display_name string, optional ...ExperimentalThreadPoolHandleAttr) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_threads": num_threads, "display_name": display_name} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ExperimentalThreadPoolHandle", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// StridedSliceGradAttr is an optional argument to StridedSliceGrad. -type StridedSliceGradAttr func(optionalAttr) - -// StridedSliceGradBeginMask sets the optional begin_mask attribute to value. -// If not specified, defaults to 0 -func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr { - return func(m optionalAttr) { - m["begin_mask"] = value - } -} - -// StridedSliceGradEndMask sets the optional end_mask attribute to value. -// If not specified, defaults to 0 -func StridedSliceGradEndMask(value int64) StridedSliceGradAttr { - return func(m optionalAttr) { - m["end_mask"] = value - } -} - -// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value. -// If not specified, defaults to 0 -func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr { - return func(m optionalAttr) { - m["ellipsis_mask"] = value - } -} - -// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value. -// If not specified, defaults to 0 -func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr { - return func(m optionalAttr) { - m["new_axis_mask"] = value - } -} - -// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value. -// If not specified, defaults to 0 -func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr { - return func(m optionalAttr) { - m["shrink_axis_mask"] = value - } -} - -// Returns the gradient of `StridedSlice`. -// -// Since `StridedSlice` cuts out pieces of its `input` which is size -// `shape`, its gradient will have the same shape (which is passed here -// as `shape`). The gradient will be zero in any element that the slice -// does not select. -// -// Arguments are the same as StridedSliceGrad with the exception that -// `dy` is the input gradient to be propagated and `shape` is the -// shape of `StridedSlice`'s `input`. -func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StridedSliceGrad", - Input: []tf.Input{ - shape, begin, end, strides, dy, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the value stored in an Optional variant or raises an error if none exists. 
-func OptionalGetValue(scope *Scope, optional tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "OptionalGetValue",
-		Input: []tf.Input{
-			optional,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("OptionalGetValue", err)
-		return
-	}
-	return components
-}
-
-// JPEG encode input image with provided compression quality.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-// `quality` is an int32 jpeg compression quality value between 0 and 100.
+// `images` is a tensor of at least 3 dimensions. The last dimension is
+// interpreted as channels, and must be three.
 //
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A scale is then applied to all the
+// saturation values, and then remapped back to RGB colorspace.
 //
 // Arguments:
 //	images: Images to adjust. At least 3-D.
-//	quality: An int quality to encode to.
+//	scale: A float scale to add to the saturation.
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpegVariableQuality(scope *Scope, images tf.Output, quality tf.Output) (contents tf.Output) {
+// Returns The saturation-adjusted image or images.
+func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpegVariableQuality",
+		Type: "AdjustSaturation",
 		Input: []tf.Input{
-			images, quality,
+			images, scale,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
-type QueueEnqueueV2Attr func(optionalAttr)
-
-// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// Draw bounding boxes on a batch of images.
 //
-// value: If the queue is full, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Enqueues a tuple of one or more tensors in the given queue.
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
+// For example, if an image is 100 x 200 pixels (height x width) and the bounding
+// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+// the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).
 //
-// N.B. If the queue is full, this operation will block until the given
-// element has been enqueued (or 'timeout_ms' elapses, if specified).
+// Parts of the bounding box may fall outside the image.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should be taken.
+// images: 4-D with shape `[batch, height, width, depth]`. A batch of images. +// boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding +// boxes. +// colors: 2-D. A list of RGBA colors to cycle through for the boxes. // -// Returns the created operation. -func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "QueueEnqueueV2", - Input: []tf.Input{ - handle, tf.OutputList(components), - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape. -type ExtractJpegShapeAttr func(optionalAttr) - -// ExtractJpegShapeOutputType sets the optional output_type attribute to value. -// -// value: (Optional) The output type of the operation (int32 or int64). -// Defaults to int32. -// If not specified, defaults to DT_INT32 -func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr { - return func(m optionalAttr) { - m["output_type"] = value - } -} - -// Extract the shape information of a JPEG-encoded image. -// -// This op only parses the image header, so it is much faster than DecodeJpeg. -// -// Arguments: -// contents: 0-D. The JPEG-encoded image. -// -// Returns 1-D. The image shape with format [height, width, channels]. -func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ExtractJpegShape", - Input: []tf.Input{ - contents, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Returns the batched diagonal part of a batched tensor. -// -// This operation returns a tensor with the `diagonal` part -// of the batched `input`. The `diagonal` part is computed as follows: -// -// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a -// tensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where: -// -// `diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`. -// -// The input must be at least a matrix. -// -// For example: -// -// ``` -// # 'input' is [[[1, 0, 0, 0] -// [0, 2, 0, 0] -// [0, 0, 3, 0] -// [0, 0, 0, 4]], -// [[5, 0, 0, 0] -// [0, 6, 0, 0] -// [0, 0, 7, 0] -// [0, 0, 0, 8]]] -// -// and input.shape = (2, 4, 4) -// -// tf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]] -// -// which has shape (2, 4) -// ``` -// -// Arguments: -// input: Rank `k` tensor where `k >= 2`. -// -// Returns The extracted diagonal(s) having shape -// `diagonal.shape = input.shape[:-2] + [min(input.shape[-2:])]`. -func MatrixDiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) { +// Returns 4-D with the same shape as `images`. The batch of input images with +// bounding boxes drawn on the images. +func DrawBoundingBoxesV2(scope *Scope, images tf.Output, boxes tf.Output, colors tf.Output) (output tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "MatrixDiagPart", + Type: "DrawBoundingBoxesV2", Input: []tf.Input{ - input, + images, boxes, colors, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// Adjust the contrast of one or more images. -// -// `images` is a tensor of at least 3 dimensions. 
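A short sketch combining the AdjustSaturation and DrawBoundingBoxesV2 wrappers introduced above; it assumes a scope `s` and a single-image float32 batch `images` of shape `[1, height, width, 3]` already in the graph, and all constants are illustrative.

```go
saturated := op.AdjustSaturation(s, images, op.Const(s, float32(1.5))) // 1.5x saturation
boxes := op.Const(s, [][][]float32{{{0.1, 0.2, 0.5, 0.9}}}) // [batch, num_boxes, 4], normalized
colors := op.Const(s, [][]float32{{1, 0, 0, 1}})            // one RGBA color: opaque red
annotated := op.DrawBoundingBoxesV2(s, saturated, boxes, colors)
_ = annotated // fetch via Session.Run as usual
```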
The last 3 dimensions are -// interpreted as `[height, width, channels]`. The other dimensions only -// represent a collection of images, such as `[batch, height, width, channels].` -// -// Contrast is adjusted independently for each channel of each image. -// -// For each channel, the Op first computes the mean of the image pixels in the -// channel and then adjusts each component of each pixel to -// `(x - mean) * contrast_factor + mean`. -// -// Arguments: -// images: Images to adjust. At least 3-D. -// contrast_factor: A float multiplier for adjusting contrast. -// -// Returns The contrast-adjusted image or images. -func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "AdjustContrastv2", - Input: []tf.Input{ - images, contrast_factor, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// DecodePngAttr is an optional argument to DecodePng. -type DecodePngAttr func(optionalAttr) - -// DecodePngChannels sets the optional channels attribute to value. -// -// value: Number of color channels for the decoded image. -// If not specified, defaults to 0 -func DecodePngChannels(value int64) DecodePngAttr { - return func(m optionalAttr) { - m["channels"] = value - } -} - -// DecodePngDtype sets the optional dtype attribute to value. -// If not specified, defaults to DT_UINT8 -func DecodePngDtype(value tf.DataType) DecodePngAttr { - return func(m optionalAttr) { - m["dtype"] = value - } -} - -// Decode a PNG-encoded image to a uint8 or uint16 tensor. -// -// The attr `channels` indicates the desired number of color channels for the -// decoded image. -// -// Accepted values are: -// -// * 0: Use the number of channels in the PNG-encoded image. -// * 1: output a grayscale image. -// * 3: output an RGB image. -// * 4: output an RGBA image. -// -// If needed, the PNG-encoded image is transformed to match the requested number -// of color channels. -// -// This op also supports decoding JPEGs and non-animated GIFs since the interface -// is the same, though it is cleaner to use `tf.image.decode_image`. -// -// Arguments: -// contents: 0-D. The PNG-encoded image. -// -// Returns 3-D with shape `[height, width, channels]`. -func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "DecodePng", - Input: []tf.Input{ - contents, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the mean along sparse segments of a tensor. -// -// See `tf.sparse.segment_sum` for usage examples. -// -// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first -// dimension, selecting a subset of dimension 0, specified by `indices`. -// -// Arguments: -// -// indices: A 1-D tensor. Has same rank as `segment_ids`. -// segment_ids: A 1-D tensor. Values should be sorted and can be repeated. -// -// Returns Has same shape as data, except for dimension 0 which -// has size `k`, the number of segments. 
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseSegmentMean", - Input: []tf.Input{ - data, indices, segment_ids, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes a range that covers the actual values present in a quantized tensor. -// -// Given a quantized tensor described by `(input, input_min, input_max)`, outputs a -// range that covers the actual values present in that tensor. This op is typically -// used to produce the `requested_output_min` and `requested_output_max` for -// `Requantize`. -// -// Arguments: -// -// input_min: The float value that the minimum quantized input value represents. -// input_max: The float value that the maximum quantized input value represents. -// -// Returns The computed min output.the computed max output. -func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "RequantizationRange", - Input: []tf.Input{ - input, input_min, input_max, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - // EncodePngAttr is an optional argument to EncodePng. type EncodePngAttr func(optionalAttr) @@ -38292,52 +38994,6 @@ func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (conten return op.Output(0) } -// RetrieveTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingCenteredRMSPropParameters. -type RetrieveTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr) - -// RetrieveTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 -// -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingCenteredRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingCenteredRMSPropParametersTableName(value string) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve centered RMSProp embedding parameters. -// -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. -// -// Returns Parameter parameters updated by the centered RMSProp optimization algorithm.Parameter ms updated by the centered RMSProp optimization algorithm.Parameter mom updated by the centered RMSProp optimization algorithm.Parameter mg updated by the centered RMSProp optimization algorithm. 
-func RetrieveTPUEmbeddingCenteredRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingCenteredRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingCenteredRMSPropParameters", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3) -} - // DecodeBmpAttr is an optional argument to DecodeBmp. type DecodeBmpAttr func(optionalAttr) @@ -38383,112 +39039,26 @@ func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (ima return op.Output(0) } -// Decode the frame(s) of a GIF-encoded image to a uint8 tensor. +// Converts one or more images from RGB to HSV. // -// GIF images with frame or transparency compression are not supported. -// On Linux and MacOS systems, convert animated GIFs from compressed to -// uncompressed by running: -// -// convert $src.gif -coalesce $dst.gif -// -// This op also supports decoding JPEGs and PNGs, though it is cleaner to use -// `tf.image.decode_image`. -// -// Arguments: -// contents: 0-D. The GIF-encoded image. -// -// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB channel order. -func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "DecodeGif", - Input: []tf.Input{ - contents, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the reverse mode backpropagated gradient of the Cholesky algorithm. -// -// For an explanation see "Differentiation of the Cholesky algorithm" by -// Iain Murray http://arxiv.org/abs/1602.07527. -// -// Arguments: -// l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`. -// Algorithm depends only on lower triangular part of the innermost matrices of -// this tensor. -// grad: df/dl where f is some scalar function. Shape is `[..., M, M]`. -// Algorithm depends only on lower triangular part of the innermost matrices of -// this tensor. -// -// Returns Symmetrized version of df/dA . Shape is `[..., M, M]` -func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "CholeskyGrad", - Input: []tf.Input{ - l, grad, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the mean along sparse segments of a tensor. -// -// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is -// misisng, the `output` tensor at that position will be zeroed. -// -// Read -// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) -// for an explanation of segments. -// -// Arguments: -// -// indices: A 1-D tensor. Has same rank as `segment_ids`. -// segment_ids: A 1-D tensor. Values should be sorted and can be repeated. -// num_segments: Should equal the number of distinct segment IDs. -// -// Returns Has same shape as data, except for dimension 0 which has size -// `num_segments`. 
-func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseSegmentMeanWithNumSegments", - Input: []tf.Input{ - data, indices, segment_ids, num_segments, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Convert one or more images from HSV to RGB. -// -// Outputs a tensor of the same shape as the `images` tensor, containing the RGB +// Outputs a tensor of the same shape as the `images` tensor, containing the HSV // value of the pixels. The output is only well defined if the value in `images` // are in `[0,1]`. // -// See `rgb_to_hsv` for a description of the HSV encoding. +// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and +// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0 +// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue. // // Arguments: -// images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3. +// images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3. // -// Returns `images` converted to RGB. -func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) { +// Returns `images` converted to HSV. +func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "HSVToRGB", + Type: "RGBToHSV", Input: []tf.Input{ images, }, @@ -38497,105 +39067,32 @@ func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) { return op.Output(0) } -// Computes the Eigen Decomposition of a batch of square self-adjoint matrices. -// -// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead. -// -// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions -// form square matrices, with the same constraints as the single matrix -// SelfAdjointEig. -// -// The result is a [..., M+1, M] matrix with [..., 0,:] containing the -// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues -// are sorted in non-decreasing order. -// -// Arguments: -// input: Shape is `[..., M, M]`. -// -// Returns Shape is `[..., M+1, M]`. -func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SelfAdjointEig", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} +// EncodeProtoAttr is an optional argument to EncodeProto. +type EncodeProtoAttr func(optionalAttr) -// DecodeProtoV2Attr is an optional argument to DecodeProtoV2. -type DecodeProtoV2Attr func(optionalAttr) - -// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value. -// -// value: Either the special value `local://` or a path to a file containing -// a serialized `FileDescriptorSet`. +// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value. // If not specified, defaults to "local://" -func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr { +func EncodeProtoDescriptorSource(value string) EncodeProtoAttr { return func(m optionalAttr) { m["descriptor_source"] = value } } -// DecodeProtoV2MessageFormat sets the optional message_format attribute to value. +// The op serializes protobuf messages provided in the input tensors. // -// value: Either `binary` or `text`. 
-// If not specified, defaults to "binary" -func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr { - return func(m optionalAttr) { - m["message_format"] = value - } -} - -// DecodeProtoV2Sanitize sets the optional sanitize attribute to value. +// The types of the tensors in `values` must match the schema for the +// fields specified in `field_names`. All the tensors in `values` must +// have a common shape prefix, *batch_shape*. // -// value: Whether to sanitize the result or not. -// If not specified, defaults to false -func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr { - return func(m optionalAttr) { - m["sanitize"] = value - } -} - -// The op extracts fields from a serialized protocol buffers message into tensors. -// -// The `decode_proto` op extracts fields from a serialized protocol buffers -// message into tensors. The fields in `field_names` are decoded and converted -// to the corresponding `output_types` if possible. +// The `sizes` tensor specifies repeat counts for each field. The repeat +// count (last dimension) of a each tensor in `values` must be greater +// than or equal to corresponding repeat count in `sizes`. // // A `message_type` name must be provided to give context for the field // names. The actual message descriptor can be looked up either in the // linked-in descriptor pool or a filename provided by the caller using // the `descriptor_source` attribute. // -// Each output tensor is a dense tensor. This means that it is padded to -// hold the largest number of repeated elements seen in the input -// minibatch. (The shape is also padded by one to prevent zero-sized -// dimensions). The actual repeat counts for each example in the -// minibatch can be found in the `sizes` output. In many cases the output -// of `decode_proto` is fed immediately into tf.squeeze if missing values -// are not a concern. When using tf.squeeze, always pass the squeeze -// dimension explicitly to avoid surprises. -// -// For the most part, the mapping between Proto field types and -// TensorFlow dtypes is straightforward. However, there are a few -// special cases: -// -// - A proto field that contains a submessage or group can only be converted -// to `DT_STRING` (the serialized submessage). This is to reduce the -// complexity of the API. The resulting string can be used as input -// to another instance of the decode_proto op. -// -// - TensorFlow lacks support for unsigned integers. The ops represent uint64 -// types as a `DT_INT64` with the same twos-complement bit pattern -// (the obvious way). Unsigned int32 values can be represented exactly by -// specifying type `DT_INT64`, or using twos-complement if the caller -// specifies `DT_INT32` in the `output_types` attribute. -// // The `descriptor_source` attribute selects a source of protocol // descriptors to consult when looking up `message_type`. This may be a // filename containing a serialized `FileDescriptorSet` message, @@ -38610,48 +39107,41 @@ func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr { // code via C++ libraries, not Python imports. You can link in a proto descriptor // by creating a cc_library target with alwayslink=1. // -// Both binary and text proto serializations are supported, and can be -// chosen using the `format` attribute. +// There are a few special cases in the value mapping: +// +// Submessage and group fields must be pre-serialized as TensorFlow strings. 
+// +// TensorFlow lacks support for unsigned int64s, so they must be +// represented as `tf.int64` with the same twos-complement bit pattern +// (the obvious way). +// +// Unsigned int32 values can be represented exactly with `tf.int64`, or +// with sign wrapping if the input is of type `tf.int32`. // // Arguments: -// bytes: Tensor of serialized protos with shape `batch_shape`. +// sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`. +// values: List of tensors containing values for the corresponding field. +// field_names: List of strings containing proto field names. // message_type: Name of the proto message type to decode. -// field_names: List of strings containing proto field names. An extension field can be decoded -// by using its full name, e.g. EXT_PACKAGE.EXT_FIELD_NAME. -// output_types: List of TF types to use for the respective field in field_names. // -// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`. -// Each entry is the number of values found for the corresponding field. -// Optional fields may have 0 or 1 values.List of tensors containing values for the corresponding field. -// `values[i]` has datatype `output_types[i]` -// and shape `[batch_shape, max(sizes[...,i])]`. -func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) { +// Returns Tensor of serialized protos with shape `batch_shape`. +func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types} + attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "DecodeProtoV2", + Type: "EncodeProto", Input: []tf.Input{ - bytes, + sizes, tf.OutputList(values), }, Attrs: attrs, } op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - sizes = op.Output(idx) - if values, idx, err = makeOutputList(op, idx, "values"); err != nil { - scope.UpdateErr("DecodeProtoV2", err) - return - } - return sizes, values + return op.Output(0) } // SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2. @@ -38798,423 +39288,6 @@ func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_b return op.Output(0), op.Output(1), op.Output(2) } -// SpaceToBatch for N-D tensors of type T. -// -// This operation divides "spatial" dimensions `[1, ..., M]` of the input into a -// grid of blocks of shape `block_shape`, and interleaves these blocks with the -// "batch" dimension (0) such that in the output, the spatial dimensions -// `[1, ..., M]` correspond to the position within the grid, and the batch -// dimension combines both the position within a spatial block and the original -// batch position. Prior to division into blocks, the spatial dimensions of the -// input are optionally zero padded according to `paddings`. See below for a -// precise description. -// -// Arguments: -// input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`, -// where spatial_shape has `M` dimensions. -// block_shape: 1-D with shape `[M]`, all values must be >= 1. -// paddings: 2-D with shape `[M, 2]`, all values must be >= 0. 
-// `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension -// `i + 1`, which corresponds to spatial dimension `i`. It is required that -// `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`. -// -// This operation is equivalent to the following steps: -// -// 1. Zero-pad the start and end of dimensions `[1, ..., M]` of the -// input according to `paddings` to produce `padded` of shape `padded_shape`. -// -// 2. Reshape `padded` to `reshaped_padded` of shape: -// -// [batch] + -// [padded_shape[1] / block_shape[0], -// block_shape[0], -// ..., -// padded_shape[M] / block_shape[M-1], -// block_shape[M-1]] + -// remaining_shape -// -// 3. Permute dimensions of `reshaped_padded` to produce -// `permuted_reshaped_padded` of shape: -// -// block_shape + -// [batch] + -// [padded_shape[1] / block_shape[0], -// ..., -// padded_shape[M] / block_shape[M-1]] + -// remaining_shape -// -// 4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch -// dimension, producing an output tensor of shape: -// -// [batch * prod(block_shape)] + -// [padded_shape[1] / block_shape[0], -// ..., -// padded_shape[M] / block_shape[M-1]] + -// remaining_shape -// -// Some examples: -// -// (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and -// `paddings = [[0, 0], [0, 0]]`: -// -// ``` -// x = [[[[1], [2]], [[3], [4]]]] -// ``` -// -// The output tensor has shape `[4, 1, 1, 1]` and value: -// -// ``` -// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]] -// ``` -// -// (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and -// `paddings = [[0, 0], [0, 0]]`: -// -// ``` -// x = [[[[1, 2, 3], [4, 5, 6]], -// [[7, 8, 9], [10, 11, 12]]]] -// ``` -// -// The output tensor has shape `[4, 1, 1, 3]` and value: -// -// ``` -// [[[[1, 2, 3]]], [[[4, 5, 6]]], [[[7, 8, 9]]], [[[10, 11, 12]]]] -// ``` -// -// (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and -// `paddings = [[0, 0], [0, 0]]`: -// -// ``` -// x = [[[[1], [2], [3], [4]], -// [[5], [6], [7], [8]], -// [[9], [10], [11], [12]], -// [[13], [14], [15], [16]]]] -// ``` -// -// The output tensor has shape `[4, 2, 2, 1]` and value: -// -// ``` -// x = [[[[1], [3]], [[9], [11]]], -// [[[2], [4]], [[10], [12]]], -// [[[5], [7]], [[13], [15]]], -// [[[6], [8]], [[14], [16]]]] -// ``` -// -// (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and -// paddings = `[[0, 0], [2, 0]]`: -// -// ``` -// x = [[[[1], [2], [3], [4]], -// [[5], [6], [7], [8]]], -// [[[9], [10], [11], [12]], -// [[13], [14], [15], [16]]]] -// ``` -// -// The output tensor has shape `[8, 1, 3, 1]` and value: -// -// ``` -// x = [[[[0], [1], [3]]], [[[0], [9], [11]]], -// [[[0], [2], [4]]], [[[0], [10], [12]]], -// [[[0], [5], [7]]], [[[0], [13], [15]]], -// [[[0], [6], [8]]], [[[0], [14], [16]]]] -// ``` -// -// Among others, this operation is useful for reducing atrous convolution into -// regular convolution. -func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddings tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SpaceToBatchND", - Input: []tf.Input{ - input, block_shape, paddings, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Concats all tensors in the list along the 0th dimension. -// -// Requires that all tensors have the same shape except the first dimension. -// -// input_handle: The input list. 
-// element_shape: The shape of the uninitialized elements in the list. If the first -// dimension is not -1, it is assumed that all list elements have the same -// leading dim. -// leading_dims: The list of leading dims of uninitialized list elements. Used if -// the leading dim of input_handle.element_shape or the element_shape input arg -// is not already set. -// tensor: The concated result. -// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient. -// -func TensorListConcatV2(scope *Scope, input_handle tf.Output, element_shape tf.Output, leading_dims tf.Output, element_dtype tf.DataType) (tensor tf.Output, lengths tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"element_dtype": element_dtype} - opspec := tf.OpSpec{ - Type: "TensorListConcatV2", - Input: []tf.Input{ - input_handle, element_shape, leading_dims, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) -} - -// CropAndResizeAttr is an optional argument to CropAndResize. -type CropAndResizeAttr func(optionalAttr) - -// CropAndResizeMethod sets the optional method attribute to value. -// -// value: A string specifying the sampling method for resizing. It can be either -// `"bilinear"` or `"nearest"` and default to `"bilinear"`. Currently two sampling -// methods are supported: Bilinear and Nearest Neighbor. -// If not specified, defaults to "bilinear" -func CropAndResizeMethod(value string) CropAndResizeAttr { - return func(m optionalAttr) { - m["method"] = value - } -} - -// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value. -// -// value: Value used for extrapolation, when applicable. -// If not specified, defaults to 0 -func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr { - return func(m optionalAttr) { - m["extrapolation_value"] = value - } -} - -// Extracts crops from the input image tensor and resizes them. -// -// Extracts crops from the input image tensor and resizes them using bilinear -// sampling or nearest neighbor sampling (possibly with aspect ratio change) to a -// common output size specified by `crop_size`. This is more general than the -// `crop_to_bounding_box` op which extracts a fixed size slice from the input image -// and does not allow resizing or aspect ratio change. -// -// Returns a tensor with `crops` from the input `image` at positions defined at the -// bounding box locations in `boxes`. The cropped boxes are all resized (with -// bilinear or nearest neighbor interpolation) to a fixed -// `size = [crop_height, crop_width]`. The result is a 4-D tensor -// `[num_boxes, crop_height, crop_width, depth]`. The resizing is corner aligned. -// In particular, if `boxes = [[0, 0, 1, 1]]`, the method will give identical -// results to using `tf.image.resize_bilinear()` or -// `tf.image.resize_nearest_neighbor()`(depends on the `method` argument) with -// `align_corners=True`. -// -// Arguments: -// image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`. -// Both `image_height` and `image_width` need to be positive. -// boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor -// specifies the coordinates of a box in the `box_ind[i]` image and is specified -// in normalized coordinates `[y1, x1, y2, x2]`. 
A normalized coordinate value of -// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the -// `[0, 1]` interval of normalized image height is mapped to -// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in -// which case the sampled crop is an up-down flipped version of the original -// image. The width dimension is treated similarly. Normalized coordinates -// outside the `[0, 1]` range are allowed, in which case we use -// `extrapolation_value` to extrapolate the input image values. -// box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`. -// The value of `box_ind[i]` specifies the image that the `i`-th box refers to. -// crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All -// cropped image patches are resized to this size. The aspect ratio of the image -// content is not preserved. Both `crop_height` and `crop_width` need to be -// positive. -// -// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`. -func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "CropAndResize", - Input: []tf.Input{ - image, boxes, box_ind, crop_size, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2. -type TensorArrayGatherV2Attr func(optionalAttr) - -// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value. -// If not specified, defaults to <unknown_rank:true > -func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr { - return func(m optionalAttr) { - m["element_shape"] = value - } -} - -// Deprecated. Use TensorArrayGatherV3 -// -// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3 -func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TensorArrayGatherV2", - Input: []tf.Input{ - handle, indices, flow_in, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression. -type NonMaxSuppressionAttr func(optionalAttr) - -// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value. -// -// value: A float representing the threshold for deciding whether boxes -// overlap too much with respect to IOU. -// If not specified, defaults to 0.5 -func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr { - return func(m optionalAttr) { - m["iou_threshold"] = value - } -} - -// Greedily selects a subset of bounding boxes in descending order of score, -// -// pruning away boxes that have high intersection-over-union (IOU) overlap -// with previously selected boxes. Bounding boxes are supplied as -// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any -// diagonal pair of box corners and the coordinates can be provided as normalized -// (i.e., lying in the interval [0, 1]) or absolute. 
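A brief sketch of calling the CropAndResize wrapper documented above; the scope `s`, the float32 batch `images` of shape `[batch, H, W, depth]`, and the box, index, and crop-size constants are illustrative assumptions.

```go
boxes := op.Const(s, [][]float32{{0.0, 0.0, 1.0, 1.0}}) // one full-image box, normalized
boxInd := op.Const(s, []int32{0})                       // the box crops image 0 of the batch
cropSize := op.Const(s, []int32{64, 64})                // resize every crop to 64x64
crops := op.CropAndResize(s, images, boxes, boxInd, cropSize,
	op.CropAndResizeMethod("bilinear"),
	op.CropAndResizeExtrapolationValue(0),
)
// crops has shape [num_boxes, 64, 64, depth].
```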
Note that this algorithm -// is agnostic to where the origin is in the coordinate system. Note that this -// algorithm is invariant to orthogonal transformations and translations -// of the coordinate system; thus translating or reflections of the coordinate -// system result in the same boxes being selected by the algorithm. -// The output of this operation is a set of integers indexing into the input -// collection of bounding boxes representing the selected boxes. The bounding -// box coordinates corresponding to the selected indices can then be obtained -// using the `tf.gather operation`. For example: -// selected_indices = tf.image.non_max_suppression( -// boxes, scores, max_output_size, iou_threshold) -// selected_boxes = tf.gather(boxes, selected_indices) -// -// Arguments: -// boxes: A 2-D float tensor of shape `[num_boxes, 4]`. -// scores: A 1-D float tensor of shape `[num_boxes]` representing a single -// score corresponding to each box (each row of boxes). -// max_output_size: A scalar integer tensor representing the maximum number of -// boxes to be selected by non max suppression. -// -// Returns A 1-D integer tensor of shape `[M]` representing the selected -// indices from the boxes tensor, where `M <= max_output_size`. -func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "NonMaxSuppression", - Input: []tf.Input{ - boxes, scores, max_output_size, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// RandomPoissonV2Attr is an optional argument to RandomPoissonV2. -type RandomPoissonV2Attr func(optionalAttr) - -// RandomPoissonV2Seed sets the optional seed attribute to value. -// -// value: If either `seed` or `seed2` are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// RandomPoissonV2Seed2 sets the optional seed2 attribute to value. -// -// value: A second seed to avoid seed collision. -// If not specified, defaults to 0 -func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// RandomPoissonV2Dtype sets the optional dtype attribute to value. -// If not specified, defaults to DT_INT64 -func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr { - return func(m optionalAttr) { - m["dtype"] = value - } -} - -// Outputs random values from the Poisson distribution(s) described by rate. -// -// This op uses two algorithms, depending on rate. If rate >= 10, then -// the algorithm by Hormann is used to acquire samples via -// transformation-rejection. -// See http://www.sciencedirect.com/science/article/pii/0167668793909974. -// -// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform -// random variables. -// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer -// Programming, Volume 2. Addison Wesley -// -// Arguments: -// shape: 1-D integer tensor. Shape of independent samples to draw from each -// distribution described by the shape parameters given in rate. 
-// rate: A tensor in which each scalar is a "rate" parameter describing the -// associated poisson distribution. -// -// Returns A tensor with shape `shape + shape(rate)`. Each slice -// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for -// `rate[i0, i1, ...iN]`. -func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RandomPoissonV2", - Input: []tf.Input{ - shape, rate, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Greedily selects a subset of bounding boxes in descending order of score, // // pruning away boxes that have high intersection-over-union (IOU) overlap @@ -39308,6 +39381,153 @@ func NonMaxSuppressionV3(scope *Scope, boxes tf.Output, scores tf.Output, max_ou return op.Output(0) } +// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput. +type Conv3DBackpropInputAttr func(optionalAttr) + +// Conv3DBackpropInputDilations sets the optional dilations attribute to value. +// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 > +func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes the gradients of 3-D convolution with respect to the input. +// +// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2 +// +// Arguments: +// input: Shape `[batch, depth, rows, cols, in_channels]`. +// filter: Shape `[depth, rows, cols, in_channels, out_channels]`. +// `in_channels` must match between `input` and `filter`. +// out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols, +// out_channels]`. +// strides: 1-D tensor of length 5. The stride of the sliding window for each +// dimension of `input`. Must have `strides[0] = strides[4] = 1`. +// padding: The type of padding algorithm to use. +func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Conv3DBackpropInput", + Input: []tf.Input{ + input, filter, out_backprop, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4. +type NonMaxSuppressionV4Attr func(optionalAttr) + +// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value. +// +// value: If true, the output `selected_indices` is padded to be of length +// `max_output_size`. Defaults to false. +// If not specified, defaults to false +func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr { + return func(m optionalAttr) { + m["pad_to_max_output_size"] = value + } +} + +// Greedily selects a subset of bounding boxes in descending order of score, +// +// pruning away boxes that have high intersection-over-union (IOU) overlap +// with previously selected boxes. Bounding boxes with score less than +// `score_threshold` are removed. 
Bounding boxes are supplied as +// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any +// diagonal pair of box corners and the coordinates can be provided as normalized +// (i.e., lying in the interval [0, 1]) or absolute. Note that this algorithm +// is agnostic to where the origin is in the coordinate system and more +// generally is invariant to orthogonal transformations and translations +// of the coordinate system; thus translating or reflections of the coordinate +// system result in the same boxes being selected by the algorithm. +// The output of this operation is a set of integers indexing into the input +// collection of bounding boxes representing the selected boxes. The bounding +// box coordinates corresponding to the selected indices can then be obtained +// using the `tf.gather operation`. For example: +// selected_indices = tf.image.non_max_suppression_v2( +// boxes, scores, max_output_size, iou_threshold, score_threshold) +// selected_boxes = tf.gather(boxes, selected_indices) +// +// Arguments: +// boxes: A 2-D float tensor of shape `[num_boxes, 4]`. +// scores: A 1-D float tensor of shape `[num_boxes]` representing a single +// score corresponding to each box (each row of boxes). +// max_output_size: A scalar integer tensor representing the maximum number of +// boxes to be selected by non max suppression. +// iou_threshold: A 0-D float tensor representing the threshold for deciding whether +// boxes overlap too much with respect to IOU. +// score_threshold: A 0-D float tensor representing the threshold for deciding when to remove +// boxes based on score. +// +// Returns A 1-D integer tensor of shape `[M]` representing the selected +// indices from the boxes tensor, where `M <= max_output_size`.A 0-D integer tensor representing the number of valid elements in +// `selected_indices`, with the valid elements appearing first. +func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "NonMaxSuppressionV4", + Input: []tf.Input{ + boxes, scores, max_output_size, iou_threshold, score_threshold, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// Serializes the tree ensemble to a proto. +// +// Arguments: +// tree_ensemble_handle: Handle to the tree ensemble. +// +// Returns Stamp token of the tree ensemble resource.Serialized proto of the ensemble. +func BoostedTreesSerializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, tree_ensemble_serialized tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "BoostedTreesSerializeEnsemble", + Input: []tf.Input{ + tree_ensemble_handle, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) +} + +// Creates a dataset that contains the elements of `input_dataset` ignoring errors. 
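A hedged sketch of the NonMaxSuppressionV4 wrapper added above; the scope `s`, the float32 tensors `boxes` (`[num_boxes, 4]`) and `scores` (`[num_boxes]`), and the threshold values are assumed to exist and are illustrative.

```go
maxOut := op.Const(s, int32(10))
iouThresh := op.Const(s, float32(0.5))
scoreThresh := op.Const(s, float32(0.1))
indices, valid := op.NonMaxSuppressionV4(s, boxes, scores, maxOut, iouThresh, scoreThresh,
	op.NonMaxSuppressionV4PadToMaxOutputSize(true), // pad indices out to length 10
)
// Only the first `valid` entries of indices are meaningful; the kept boxes can
// then be gathered with op.Gather(s, boxes, indices).
```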
+func ExperimentalIgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "ExperimentalIgnoreErrorsDataset", + Input: []tf.Input{ + input_dataset, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // CombinedNonMaxSuppressionAttr is an optional argument to CombinedNonMaxSuppression. type CombinedNonMaxSuppressionAttr func(optionalAttr) @@ -39393,60 +39613,152 @@ func CombinedNonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, return op.Output(0), op.Output(1), op.Output(2), op.Output(3) } -// FakeQuantWithMinMaxArgsAttr is an optional argument to FakeQuantWithMinMaxArgs. -type FakeQuantWithMinMaxArgsAttr func(optionalAttr) - -// FakeQuantWithMinMaxArgsMin sets the optional min attribute to value. -// If not specified, defaults to -6 -func FakeQuantWithMinMaxArgsMin(value float32) FakeQuantWithMinMaxArgsAttr { - return func(m optionalAttr) { - m["min"] = value - } -} - -// FakeQuantWithMinMaxArgsMax sets the optional max attribute to value. -// If not specified, defaults to 6 -func FakeQuantWithMinMaxArgsMax(value float32) FakeQuantWithMinMaxArgsAttr { - return func(m optionalAttr) { - m["max"] = value - } -} - -// FakeQuantWithMinMaxArgsNumBits sets the optional num_bits attribute to value. -// If not specified, defaults to 8 -func FakeQuantWithMinMaxArgsNumBits(value int64) FakeQuantWithMinMaxArgsAttr { - return func(m optionalAttr) { - m["num_bits"] = value - } -} - -// FakeQuantWithMinMaxArgsNarrowRange sets the optional narrow_range attribute to value. -// If not specified, defaults to false -func FakeQuantWithMinMaxArgsNarrowRange(value bool) FakeQuantWithMinMaxArgsAttr { - return func(m optionalAttr) { - m["narrow_range"] = value - } -} - -// Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type. +// Compute the polygamma function \\(\psi^{(n)}(x)\\). // -// Attributes `[min; max]` define the clamping range for the `inputs` data. -// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]` -// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and -// then de-quantized and output as floats in `[min; max]` interval. -// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive. +// The polygamma function is defined as: // -// Before quantization, `min` and `max` values are adjusted with the following -// logic. -// It is suggested to have `min <= 0 <= max`. If `0` is not in the range of values, -// the behavior can be unexpected: -// If `0 < min < max`: `min_adj = 0` and `max_adj = max - min`. -// If `min < max < 0`: `min_adj = min - max` and `max_adj = 0`. -// If `min <= 0 <= max`: `scale = (max - min) / (2^num_bits - 1) `, -// `min_adj = scale * round(min / scale)` and `max_adj = max + min_adj - min`. // -// Quantization is called fake since the output is still in floating point. -func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsAttr) (outputs tf.Output) { +// \\(\psi^{(a)}(x) = \frac{d^a}{dx^a} \psi(x)\\) +// +// where \\(\psi(x)\\) is the digamma function. +// The polygamma function is defined only for non-negative integer orders \\a\\. 
+func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Polygamma", + Input: []tf.Input{ + a, x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Says whether the targets are in the top `K` predictions. +// +// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the +// prediction for the target class is among the top `k` predictions among +// all predictions for example `i`. Note that the behavior of `InTopK` differs +// from the `TopK` op in its handling of ties; if multiple classes have the +// same prediction value and straddle the top-`k` boundary, all of those +// classes are considered to be in the top `k`. +// +// More formally, let +// +// \\(predictions_i\\) be the predictions for all classes for example `i`, +// \\(targets_i\\) be the target class for example `i`, +// \\(out_i\\) be the output for example `i`, +// +// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$ +// +// Arguments: +// predictions: A `batch_size` x `classes` tensor. +// targets: A `batch_size` vector of class ids. +// k: Number of top elements to look at for computing precision. +// +// Returns Computed precision at `k` as a `bool Tensor`. +func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "InTopKV2", + Input: []tf.Input{ + predictions, targets, k, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AvgPool3DAttr is an optional argument to AvgPool3D. +type AvgPool3DAttr func(optionalAttr) + +// AvgPool3DDataFormat sets the optional data_format attribute to value. +// +// value: The data format of the input and output data. With the +// default format "NDHWC", the data is stored in the order of: +// [batch, in_depth, in_height, in_width, in_channels]. +// Alternatively, the format could be "NCDHW", the data storage order is: +// [batch, in_channels, in_depth, in_height, in_width]. +// If not specified, defaults to "NDHWC" +func AvgPool3DDataFormat(value string) AvgPool3DAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// Performs 3D average pooling on the input. +// +// Arguments: +// input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over. +// ksize: 1-D tensor of length 5. The size of the window for each dimension of +// the input tensor. Must have `ksize[0] = ksize[4] = 1`. +// strides: 1-D tensor of length 5. The stride of the sliding window for each +// dimension of `input`. Must have `strides[0] = strides[4] = 1`. +// padding: The type of padding algorithm to use. +// +// Returns The average pooled output tensor. +func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "AvgPool3D", + Input: []tf.Input{ + input, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// RaggedRangeAttr is an optional argument to RaggedRange. +type RaggedRangeAttr func(optionalAttr) + +// RaggedRangeTsplits sets the optional Tsplits attribute to value. 
+// If not specified, defaults to DT_INT64 +func RaggedRangeTsplits(value tf.DataType) RaggedRangeAttr { + return func(m optionalAttr) { + m["Tsplits"] = value + } +} + +// Returns a `RaggedTensor` containing the specified sequences of numbers. +// +// +// Returns a `RaggedTensor` `result` composed from `rt_dense_values` and +// `rt_nested_splits`, such that +// `result[i] = range(starts[i], limits[i], deltas[i])`. +// +// ```python +// >>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range( +// ... starts=[2, 5, 8], limits=[3, 5, 12], deltas=1) +// >>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits) +// >>> print result.eval().tolist() +// [[2], # result[0] = range(2, 3) +// [], # result[1] = range(5, 5) +// [8, 9, 10, 11]] # result[2] = range(8, 12) +// ``` +// +// The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors. +// The vector inputs must all have the same size. Scalar inputs are broadcast +// to match the size of the vector inputs. +// +// Arguments: +// starts: The starts of each range. +// limits: The limits of each range. +// deltas: The deltas of each range. +// +// Returns The `row_splits` for the returned `RaggedTensor`.The `flat_values` for the returned `RaggedTensor`. +func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output, optional ...RaggedRangeAttr) (rt_nested_splits tf.Output, rt_dense_values tf.Output) { if scope.Err() != nil { return } @@ -39455,54 +39767,14 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua a(attrs) } opspec := tf.OpSpec{ - Type: "FakeQuantWithMinMaxArgs", + Type: "RaggedRange", Input: []tf.Input{ - inputs, + starts, limits, deltas, }, Attrs: attrs, } op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a dataset that contains `count` elements from the `input_dataset`. -// -// Arguments: -// -// count: A scalar representing the number of elements from the `input_dataset` -// that should be taken. A value of `-1` indicates that all of `input_dataset` -// is taken. -// -// -func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "TakeDataset", - Input: []tf.Input{ - input_dataset, count, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes cos of x element-wise. -func Cos(scope *Scope, x tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Cos", - Input: []tf.Input{ - x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) + return op.Output(0), op.Output(1) } // Creates a dataset that shards the input dataset. @@ -39537,110 +39809,16 @@ func ExperimentalAutoShardDataset(scope *Scope, input_dataset tf.Output, num_wor return op.Output(0) } -// Calculates gains for each feature and returns the best possible split information for the feature. -// -// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature. -// -// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split. 
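A Go counterpart to the Python snippet in the RaggedRange doc comment above, assuming a scope `s`; the values mirror that example.

```go
starts := op.Const(s, []int64{2, 5, 8})
limits := op.Const(s, []int64{3, 5, 12})
deltas := op.Const(s, []int64{1, 1, 1})
splits, values := op.RaggedRange(s, starts, limits, deltas)
// values == [2, 8, 9, 10, 11] and splits == [0, 1, 1, 5]; row i of the ragged
// result is values[splits[i]:splits[i+1]], i.e. [[2], [], [8, 9, 10, 11]].
```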
-// -// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features). -// -// The length of output lists are all of the same length, `num_features`. -// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature. -// -// Arguments: -// node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive). -// stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used. -// l1: l1 regularization factor on leaf weights, per instance based. -// l2: l2 regularization factor on leaf weights, per instance based. -// tree_complexity: adjustment to the gain, per leaf based. -// min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting. -// max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors. -// -// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node. 
-func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"max_splits": max_splits} - opspec := tf.OpSpec{ - Type: "BoostedTreesCalculateBestGainsPerFeature", - Input: []tf.Input{ - node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil { - scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err) - return - } - if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil { - scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err) - return - } - if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil { - scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err) - return - } - if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil { - scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err) - return - } - if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil { - scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err) - return - } - return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list -} - -// Writes the given dataset to the given file using the TFRecord format. -// -// Arguments: -// input_dataset: A variant tensor representing the dataset to write. -// filename: A scalar string tensor representing the filename to use. -// compression_type: A scalar string tensor containing either (i) the empty string (no -// compression), (ii) "ZLIB", or (iii) "GZIP". -// -// Returns the created operation. -func ExperimentalDatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ExperimentalDatasetToTFRecord", - Input: []tf.Input{ - input_dataset, filename, compression_type, - }, - } - return scope.AddOperation(opspec) -} - -// Creates a dataset that batches input elements into a SparseTensor. -// -// Arguments: -// input_dataset: A handle to an input dataset. Must have a single component. -// batch_size: A scalar representing the number of elements to accumulate in a -// batch. -// row_shape: A vector representing the dense shape of each row in the produced -// SparseTensor. The shape may be partially specified, using `-1` to indicate -// that a particular dimension should use the maximum size of all batch elements. -// -// -func ExperimentalDenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { +// Records the bytes size of each element of `input_dataset` in a StatsAggregator. 
+func ExperimentalBytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { if scope.Err() != nil { return } attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} opspec := tf.OpSpec{ - Type: "ExperimentalDenseToSparseBatchDataset", + Type: "ExperimentalBytesProducedStatsDataset", Input: []tf.Input{ - input_dataset, batch_size, row_shape, + input_dataset, tag, }, Attrs: attrs, } @@ -39648,45 +39826,99 @@ func ExperimentalDenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output return op.Output(0) } -// MaxPoolAttr is an optional argument to MaxPool. -type MaxPoolAttr func(optionalAttr) - -// MaxPoolDataFormat sets the optional data_format attribute to value. +// Returns the cardinality of `input_dataset`. // -// value: Specify the data format of the input and output data. With the -// default format "NHWC", the data is stored in the order of: -// [batch, in_height, in_width, in_channels]. -// Alternatively, the format could be "NCHW", the data storage order of: -// [batch, in_channels, in_height, in_width]. -// If not specified, defaults to "NHWC" -func MaxPoolDataFormat(value string) MaxPoolAttr { - return func(m optionalAttr) { - m["data_format"] = value - } -} - -// Performs max pooling on the input. +// Returns the cardinality of `input_dataset`. // // Arguments: -// input: 4-D input to pool over. -// ksize: The size of the window for each dimension of the input tensor. -// strides: The stride of the sliding window for each dimension of the -// input tensor. -// padding: The type of padding algorithm to use. +// input_dataset: A variant tensor representing the dataset to return cardinality for. // -// Returns The max pooled output tensor. -func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) { +// Returns The cardinality of `input_dataset`. Named constants are used to represent +// infinite and unknown cardinality. +func ExperimentalDatasetCardinality(scope *Scope, input_dataset tf.Output) (cardinality tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding} + opspec := tf.OpSpec{ + Type: "ExperimentalDatasetCardinality", + Input: []tf.Input{ + input_dataset, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// MfccAttr is an optional argument to Mfcc. +type MfccAttr func(optionalAttr) + +// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value. +// +// value: The highest frequency to use when calculating the +// ceptstrum. +// If not specified, defaults to 4000 +func MfccUpperFrequencyLimit(value float32) MfccAttr { + return func(m optionalAttr) { + m["upper_frequency_limit"] = value + } +} + +// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value. +// +// value: The lowest frequency to use when calculating the +// ceptstrum. +// If not specified, defaults to 20 +func MfccLowerFrequencyLimit(value float32) MfccAttr { + return func(m optionalAttr) { + m["lower_frequency_limit"] = value + } +} + +// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value. +// +// value: Resolution of the Mel bank used internally. 
+// If not specified, defaults to 40 +func MfccFilterbankChannelCount(value int64) MfccAttr { + return func(m optionalAttr) { + m["filterbank_channel_count"] = value + } +} + +// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value. +// +// value: How many output channels to produce per time slice. +// If not specified, defaults to 13 +func MfccDctCoefficientCount(value int64) MfccAttr { + return func(m optionalAttr) { + m["dct_coefficient_count"] = value + } +} + +// Transforms a spectrogram into a form that's useful for speech recognition. +// +// Mel Frequency Cepstral Coefficients are a way of representing audio data that's +// been effective as an input feature for machine learning. They are created by +// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the +// higher frequencies that are less significant to the human ear. They have a long +// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum +// is a good resource to learn more. +// +// Arguments: +// spectrogram: Typically produced by the Spectrogram op, with magnitude_squared +// set to true. +// sample_rate: How many samples per second the source audio used. +func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "MaxPool", + Type: "Mfcc", Input: []tf.Input{ - input, + spectrogram, sample_rate, }, Attrs: attrs, } @@ -39694,6 +39926,24 @@ func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padd return op.Output(0) } +// Returns the number of work units this Reader has finished processing. +// +// Arguments: +// reader_handle: Handle to a Reader. +func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "ReaderNumWorkUnitsCompletedV2", + Input: []tf.Input{ + reader_handle, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // A substitute for `InterleaveDataset` on a fixed list of `N` datasets. // // Arguments: @@ -39719,30 +39969,21 @@ func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset return op.Output(0) } -// DestroyResourceOpAttr is an optional argument to DestroyResourceOp. -type DestroyResourceOpAttr func(optionalAttr) +// ShapeNAttr is an optional argument to ShapeN. +type ShapeNAttr func(optionalAttr) -// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value. -// -// value: whether to ignore the error when the resource -// doesn't exist. -// If not specified, defaults to true -func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr { +// ShapeNOutType sets the optional out_type attribute to value. +// If not specified, defaults to DT_INT32 +func ShapeNOutType(value tf.DataType) ShapeNAttr { return func(m optionalAttr) { - m["ignore_lookup_error"] = value + m["out_type"] = value } } -// Deletes the resource specified by the handle. +// Returns shape of tensors. // -// All subsequent operations using the resource will result in a NotFound -// error status. -// -// Arguments: -// resource: handle to the resource to delete. -// -// Returns the created operation. 
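// Illustrative sketch of how the functional-option attributes defined above
// (for example MfccDctCoefficientCount) are applied when building an op. The
// placeholder input, the 16 kHz sample rate, and the import paths are
// assumptions made for the example, not part of this change.
package main

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func buildMfcc() (*tf.Graph, tf.Output, error) {
	s := op.NewScope()
	// The spectrogram is typically produced by the Spectrogram op with
	// magnitude_squared set to true; a placeholder stands in for it here.
	spectrogram := op.Placeholder(s.SubScope("spectrogram"), tf.Float)
	sampleRate := op.Const(s.SubScope("rate"), int32(16000))
	// Optional attributes are plain functions that fill in the attrs map.
	mfcc := op.Mfcc(s, spectrogram, sampleRate,
		op.MfccDctCoefficientCount(13),
		op.MfccFilterbankChannelCount(40))
	graph, err := s.Finalize()
	return graph, mfcc, err
}

func main() {
	if _, _, err := buildMfcc(); err != nil {
		panic(err)
	}
}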
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) { +// This operation returns N 1-D integer tensors representing shape of `input[i]s`. +func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) { if scope.Err() != nil { return } @@ -39751,177 +39992,156 @@ func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyReso a(attrs) } opspec := tf.OpSpec{ - Type: "DestroyResourceOp", + Type: "ShapeN", Input: []tf.Input{ - resource, + tf.OutputList(input), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if output, idx, err = makeOutputList(op, idx, "output"); err != nil { + scope.UpdateErr("ShapeN", err) + return + } + return output +} + +// Creates a dataset that changes the batch size. +// +// Creates a dataset that changes the batch size of the dataset to current batch +// size // num_workers. +// +// Arguments: +// input_dataset: A variant tensor representing the input dataset. +// num_workers: A scalar representing the number of workers to distribute this batch across. As +// a result of this transformation the current batch size would end up being +// divided by this parameter. +// +// +func ExperimentalRebatchDataset(scope *Scope, input_dataset tf.Output, num_workers tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "ExperimentalRebatchDataset", + Input: []tf.Input{ + input_dataset, num_workers, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Compute the pairwise cross product. +// +// `a` and `b` must be the same shape; they can either be simple 3-element vectors, +// or any shape where the innermost dimension is 3. In the latter case, each pair +// of corresponding 3-element vectors is cross-multiplied independently. +// +// Arguments: +// a: A tensor containing 3-element vectors. +// b: Another tensor, of same type and shape as `a`. +// +// Returns Pairwise cross product of the vectors in `a` and `b`. +func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Cross", + Input: []tf.Input{ + a, b, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a dataset that executes a SQL query and emits rows of the result set. +// +// Arguments: +// driver_name: The database type. Currently, the only supported type is 'sqlite'. +// data_source_name: A connection string to connect to the database. +// query: A SQL query to execute. +// +// +func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "ExperimentalSqlDataset", + Input: []tf.Input{ + driver_name, data_source_name, query, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// LoadTPUEmbeddingMomentumParametersAttr is an optional argument to LoadTPUEmbeddingMomentumParameters. 
+type LoadTPUEmbeddingMomentumParametersAttr func(optionalAttr) + +// LoadTPUEmbeddingMomentumParametersTableId sets the optional table_id attribute to value. +// If not specified, defaults to -1 +// +// REQUIRES: value >= -1 +func LoadTPUEmbeddingMomentumParametersTableId(value int64) LoadTPUEmbeddingMomentumParametersAttr { + return func(m optionalAttr) { + m["table_id"] = value + } +} + +// LoadTPUEmbeddingMomentumParametersTableName sets the optional table_name attribute to value. +// If not specified, defaults to "" +func LoadTPUEmbeddingMomentumParametersTableName(value string) LoadTPUEmbeddingMomentumParametersAttr { + return func(m optionalAttr) { + m["table_name"] = value + } +} + +// Load Momentum embedding parameters. +// +// An op that loads optimization parameters into HBM for embedding. Must be +// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct +// embedding table configuration. For example, this op is used to install +// parameters that are loaded from a checkpoint before a training loop is +// executed. +// +// Arguments: +// parameters: Value of parameters used in the Momentum optimization algorithm. +// momenta: Value of momenta used in the Momentum optimization algorithm. +// +// +// +// Returns the created operation. +func LoadTPUEmbeddingMomentumParameters(scope *Scope, parameters tf.Output, momenta tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "LoadTPUEmbeddingMomentumParameters", + Input: []tf.Input{ + parameters, momenta, }, Attrs: attrs, } return scope.AddOperation(opspec) } -// StatelessMultinomialAttr is an optional argument to StatelessMultinomial. -type StatelessMultinomialAttr func(optionalAttr) - -// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value. -// If not specified, defaults to DT_INT64 -func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr { - return func(m optionalAttr) { - m["output_dtype"] = value - } -} - -// Draws samples from a multinomial distribution. -// -// Arguments: -// logits: 2-D Tensor with shape `[batch_size, num_classes]`. Each slice `[i, :]` -// represents the unnormalized log probabilities for all classes. -// num_samples: 0-D. Number of independent samples to draw for each row slice. -// seed: 2 seeds (shape [2]). -// -// Returns 2-D Tensor with shape `[batch_size, num_samples]`. Each slice `[i, :]` -// contains the drawn class labels with range `[0, num_classes)`. -func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StatelessMultinomial", - Input: []tf.Input{ - logits, num_samples, seed, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ExperimentalParseExampleDatasetAttr is an optional argument to ExperimentalParseExampleDataset. -type ExperimentalParseExampleDatasetAttr func(optionalAttr) - -// ExperimentalParseExampleDatasetSloppy sets the optional sloppy attribute to value. 
-// If not specified, defaults to false -func ExperimentalParseExampleDatasetSloppy(value bool) ExperimentalParseExampleDatasetAttr { - return func(m optionalAttr) { - m["sloppy"] = value - } -} - -// Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features. -// -// Arguments: -// -// -// dense_defaults: A dict mapping string keys to `Tensor`s. -// The keys of the dict must match the dense_keys of the feature. -// sparse_keys: A list of string keys in the examples features. -// The results for these keys will be returned as `SparseTensor` objects. -// dense_keys: A list of Ndense string Tensors (scalars). -// The keys expected in the Examples features associated with dense values. -// sparse_types: A list of `DTypes` of the same length as `sparse_keys`. -// Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), -// and `tf.string` (`BytesList`) are supported. -// dense_shapes: List of tuples with the same length as `dense_keys`. -// The shape of the data for each dense feature referenced by `dense_keys`. -// Required for any input tensors identified by `dense_keys`. Must be -// either fully defined, or may contain an unknown first dimension. -// An unknown first dimension means the feature is treated as having -// a variable number of blocks, and the output shape along this dimension -// is considered unknown at graph build time. Padding is applied for -// minibatch elements smaller than the maximum number of blocks for the -// given feature along this dimension. -// output_types: The type list for the return values. -// output_shapes: The list of shapes being produced. -func ExperimentalParseExampleDataset(scope *Scope, input_dataset tf.Output, num_parallel_calls tf.Output, dense_defaults []tf.Output, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ExperimentalParseExampleDatasetAttr) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes, "output_types": output_types, "output_shapes": output_shapes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ExperimentalParseExampleDataset", - Input: []tf.Input{ - input_dataset, num_parallel_calls, tf.OutputList(dense_defaults), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResizeBilinearAttr is an optional argument to ResizeBilinear. -type ResizeBilinearAttr func(optionalAttr) - -// ResizeBilinearAlignCorners sets the optional align_corners attribute to value. -// -// value: If true, the centers of the 4 corner pixels of the input and output tensors are -// aligned, preserving the values at the corner pixels. Defaults to false. -// If not specified, defaults to false -func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr { - return func(m optionalAttr) { - m["align_corners"] = value - } -} - -// ResizeBilinearHalfPixelCenters sets the optional half_pixel_centers attribute to value. -// If not specified, defaults to false -func ResizeBilinearHalfPixelCenters(value bool) ResizeBilinearAttr { - return func(m optionalAttr) { - m["half_pixel_centers"] = value - } -} - -// Resize `images` to `size` using bilinear interpolation. 
-// -// Input images can be of different types but output images are always float. -// -// Arguments: -// images: 4-D with shape `[batch, height, width, channels]`. -// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The -// new size for the images. -// -// Returns 4-D with shape -// `[batch, new_height, new_width, channels]`. -func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResizeBilinear", - Input: []tf.Input{ - images, size, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Produces a summary of any statistics recorded by the given statistics manager. -func ExperimentalStatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ExperimentalStatsAggregatorSummary", - Input: []tf.Input{ - iterator, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Creates a dataset that contains the unique elements of `input_dataset`. func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { if scope.Err() != nil { @@ -39939,261 +40159,21 @@ func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_typ return op.Output(0) } -// ResourceGatherAttr is an optional argument to ResourceGather. -type ResourceGatherAttr func(optionalAttr) - -// ResourceGatherBatchDims sets the optional batch_dims attribute to value. -// If not specified, defaults to 0 -func ResourceGatherBatchDims(value int64) ResourceGatherAttr { - return func(m optionalAttr) { - m["batch_dims"] = value - } -} - -// ResourceGatherValidateIndices sets the optional validate_indices attribute to value. -// If not specified, defaults to true -func ResourceGatherValidateIndices(value bool) ResourceGatherAttr { - return func(m optionalAttr) { - m["validate_indices"] = value - } -} - -// Gather slices from the variable pointed to by `resource` according to `indices`. -// -// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D). -// Produces an output tensor with shape `indices.shape + params.shape[1:]` where: -// -// ```python -// # Scalar indices -// output[:, ..., :] = params[indices, :, ... :] -// -// # Vector indices -// output[i, :, ..., :] = params[indices[i], :, ... :] -// -// # Higher rank indices -// output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :] -// ``` -func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceGather", - Input: []tf.Input{ - resource, indices, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Deserializes a serialized tree ensemble config and replaces current tree -// -// ensemble. -// -// Arguments: -// tree_ensemble_handle: Handle to the tree ensemble. -// stamp_token: Token to use as the new value of the resource stamp. -// tree_ensemble_serialized: Serialized proto of the ensemble. -// -// Returns the created operation. 
-func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) { +// Returns the name of the device on which `resource` has been placed. +func ExperimentalIteratorGetDevice(scope *Scope, resource tf.Output) (device tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "BoostedTreesDeserializeEnsemble", + Type: "ExperimentalIteratorGetDevice", Input: []tf.Input{ - tree_ensemble_handle, stamp_token, tree_ensemble_serialized, - }, - } - return scope.AddOperation(opspec) -} - -// Adds v into specified rows of x. -// -// Computes y = x; y[i, :] += v; return y. -// -// Arguments: -// x: A `Tensor` of type T. -// i: A vector. Indices into the left-most dimension of `x`. -// v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size. -// -// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`. -func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "InplaceAdd", - Input: []tf.Input{ - x, i, v, + resource, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// Restore a Reader to its initial clean state. -// -// Arguments: -// reader_handle: Handle to a Reader. -// -// Returns the created operation. -func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "ReaderResetV2", - Input: []tf.Input{ - reader_handle, - }, - } - return scope.AddOperation(opspec) -} - -// Creates a dataset that emits each dim-0 slice of `components` once. -func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "TensorSliceDataset", - Input: []tf.Input{ - tf.OutputList(components), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a dataset that zips together `input_datasets`. -func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "ZipDataset", - Input: []tf.Input{ - tf.OutputList(input_datasets), - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Computes the sum along sparse segments of a tensor divided by the sqrt of N. -// -// N is the size of the segment being reduced. -// -// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is -// misisng, the `output` tensor at that position will be zeroed. -// -// Read -// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation) -// for an explanation of segments. -// -// Arguments: -// -// indices: A 1-D tensor. Has same rank as `segment_ids`. -// segment_ids: A 1-D tensor. Values should be sorted and can be repeated. -// num_segments: Should equal the number of distinct segment IDs. -// -// Returns Has same shape as data, except for dimension 0 which -// has size `k`, the number of segments. 
-func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseSegmentSqrtNWithNumSegments", - Input: []tf.Input{ - data, indices, segment_ids, num_segments, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Creates a dataset that skips `count` elements from the `input_dataset`. -// -// Arguments: -// -// count: A scalar representing the number of elements from the `input_dataset` -// that should be skipped. If count is -1, skips everything. -// -// -func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} - opspec := tf.OpSpec{ - Type: "SkipDataset", - Input: []tf.Input{ - input_dataset, count, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// BoostedTreesQuantileStreamResourceFlushAttr is an optional argument to BoostedTreesQuantileStreamResourceFlush. -type BoostedTreesQuantileStreamResourceFlushAttr func(optionalAttr) - -// BoostedTreesQuantileStreamResourceFlushGenerateQuantiles sets the optional generate_quantiles attribute to value. -// -// value: bool; If True, the output will be the num_quantiles for each stream where the ith -// entry is the ith quantile of the input with an approximation error of epsilon. -// Duplicate values may be present. -// If False, the output will be the points in the histogram that we got which roughly -// translates to 1/epsilon boundaries and without any duplicates. -// Default to False. -// If not specified, defaults to false -func BoostedTreesQuantileStreamResourceFlushGenerateQuantiles(value bool) BoostedTreesQuantileStreamResourceFlushAttr { - return func(m optionalAttr) { - m["generate_quantiles"] = value - } -} - -// Flush the summaries for a quantile stream resource. -// -// An op that flushes the summaries for a quantile stream resource. -// -// Arguments: -// quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource. -// num_buckets: int; approximate number of buckets unless using generate_quantiles. -// -// Returns the created operation. -func BoostedTreesQuantileStreamResourceFlush(scope *Scope, quantile_stream_resource_handle tf.Output, num_buckets tf.Output, optional ...BoostedTreesQuantileStreamResourceFlushAttr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "BoostedTreesQuantileStreamResourceFlush", - Input: []tf.Input{ - quantile_stream_resource_handle, num_buckets, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - // Creates a dataset that overrides the maximum intra-op parallelism. // // Arguments: @@ -40217,53 +40197,22 @@ func ExperimentalMaxIntraOpParallelismDataset(scope *Scope, input_dataset tf.Out return op.Output(0) } -// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle. -type IteratorFromStringHandleAttr func(optionalAttr) - -// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value. -// -// value: If specified, defines the type of each tuple component in an -// element produced by the resulting iterator. 
-// If not specified, defaults to <> -// -// REQUIRES: len(value) >= 0 -func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr { - return func(m optionalAttr) { - m["output_types"] = value - } -} - -// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value. -// -// value: If specified, defines the shape of each tuple component in an -// element produced by the resulting iterator. -// If not specified, defaults to <> -// -// REQUIRES: len(value) >= 0 -func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr { - return func(m optionalAttr) { - m["output_shapes"] = value - } -} - -// Converts the given string representing a handle to an iterator to a resource. +// Creates a dataset that uses a custom thread pool to compute `input_dataset`. // // Arguments: -// string_handle: A string representation of the given handle. // -// Returns A handle to an iterator resource. -func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) { +// num_threads: Identifies the number of threads to use for the private threadpool. +// +// +func ExperimentalPrivateThreadPoolDataset(scope *Scope, input_dataset tf.Output, num_threads tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} opspec := tf.OpSpec{ - Type: "IteratorFromStringHandle", + Type: "ExperimentalPrivateThreadPoolDataset", Input: []tf.Input{ - string_handle, + input_dataset, num_threads, }, Attrs: attrs, } @@ -40271,167 +40220,290 @@ func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional .. return op.Output(0) } -// RetrieveTPUEmbeddingAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingAdagradParameters. -type RetrieveTPUEmbeddingAdagradParametersAttr func(optionalAttr) - -// RetrieveTPUEmbeddingAdagradParametersTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 +// Creates a dataset that contains `rate` elements from the `input_dataset`. // -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingAdagradParametersTableId(value int64) RetrieveTPUEmbeddingAdagradParametersAttr { - return func(m optionalAttr) { - m["table_id"] = value - } -} - -// RetrieveTPUEmbeddingAdagradParametersTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingAdagradParametersTableName(value string) RetrieveTPUEmbeddingAdagradParametersAttr { - return func(m optionalAttr) { - m["table_name"] = value - } -} - -// Retrieve Adagrad embedding parameters. +// Arguments: // -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. +// rate: A scalar representing the sample rate of elements from the `input_dataset` +// that should be taken. +// seed: A scalar representing seed of random number generator. +// seed2: A scalar representing seed2 of random number generator. 
// -// Returns Parameter parameters updated by the Adagrad optimization algorithm.Parameter accumulators updated by the Adagrad optimization algorithm. -func RetrieveTPUEmbeddingAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) { +// +func SamplingDataset(scope *Scope, input_dataset tf.Output, rate tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingAdagradParameters", - + Type: "SamplingDataset", + Input: []tf.Input{ + input_dataset, rate, seed, seed2, + }, Attrs: attrs, } op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1) + return op.Output(0) } -// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingFTRLParametersGradAccumDebug. -type RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr) +// DecodeProtoV2Attr is an optional argument to DecodeProtoV2. +type DecodeProtoV2Attr func(optionalAttr) -// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value. -// If not specified, defaults to -1 +// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value. // -// REQUIRES: value >= -1 -func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr { +// value: Either the special value `local://` or a path to a file containing +// a serialized `FileDescriptorSet`. +// If not specified, defaults to "local://" +func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr { return func(m optionalAttr) { - m["table_id"] = value + m["descriptor_source"] = value } } -// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value. -// If not specified, defaults to "" -func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr { +// DecodeProtoV2MessageFormat sets the optional message_format attribute to value. +// +// value: Either `binary` or `text`. +// If not specified, defaults to "binary" +func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr { return func(m optionalAttr) { - m["table_name"] = value + m["message_format"] = value } } -// Retrieve FTRL embedding parameters with debug support. +// DecodeProtoV2Sanitize sets the optional sanitize attribute to value. // -// An op that retrieves optimization parameters from embedding to host -// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up -// the correct embedding table configuration. For example, this op is -// used to retrieve updated parameters before saving a checkpoint. -// -// Returns Parameter parameters updated by the FTRL optimization algorithm.Parameter accumulators updated by the FTRL optimization algorithm.Parameter linears updated by the FTRL optimization algorithm.Parameter gradient_accumulators updated by the FTRL optimization algorithm. 
-func RetrieveTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3) -} - -// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp. -type ResourceApplyRMSPropAttr func(optionalAttr) - -// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var, ms, and mom tensors is protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. +// value: Whether to sanitize the result or not. // If not specified, defaults to false -func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr { +func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr { return func(m optionalAttr) { - m["use_locking"] = value + m["sanitize"] = value } } -// Update '*var' according to the RMSProp algorithm. +// The op extracts fields from a serialized protocol buffers message into tensors. // -// Note that in dense implementation of this algorithm, ms and mom will -// update even if the grad is zero, but in this sparse implementation, ms -// and mom will not update in iterations during which the grad is zero. +// The `decode_proto` op extracts fields from a serialized protocol buffers +// message into tensors. The fields in `field_names` are decoded and converted +// to the corresponding `output_types` if possible. // -// mean_square = decay * mean_square + (1-decay) * gradient ** 2 -// Delta = learning_rate * gradient / sqrt(mean_square + epsilon) +// A `message_type` name must be provided to give context for the field +// names. The actual message descriptor can be looked up either in the +// linked-in descriptor pool or a filename provided by the caller using +// the `descriptor_source` attribute. // -// ms <- rho * ms_{t-1} + (1-rho) * grad * grad -// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -// var <- var - mom +// Each output tensor is a dense tensor. This means that it is padded to +// hold the largest number of repeated elements seen in the input +// minibatch. (The shape is also padded by one to prevent zero-sized +// dimensions). The actual repeat counts for each example in the +// minibatch can be found in the `sizes` output. In many cases the output +// of `decode_proto` is fed immediately into tf.squeeze if missing values +// are not a concern. When using tf.squeeze, always pass the squeeze +// dimension explicitly to avoid surprises. +// +// For the most part, the mapping between Proto field types and +// TensorFlow dtypes is straightforward. However, there are a few +// special cases: +// +// - A proto field that contains a submessage or group can only be converted +// to `DT_STRING` (the serialized submessage). This is to reduce the +// complexity of the API. The resulting string can be used as input +// to another instance of the decode_proto op. +// +// - TensorFlow lacks support for unsigned integers. 
The ops represent uint64 +// types as a `DT_INT64` with the same twos-complement bit pattern +// (the obvious way). Unsigned int32 values can be represented exactly by +// specifying type `DT_INT64`, or using twos-complement if the caller +// specifies `DT_INT32` in the `output_types` attribute. +// +// The `descriptor_source` attribute selects a source of protocol +// descriptors to consult when looking up `message_type`. This may be a +// filename containing a serialized `FileDescriptorSet` message, +// or the special value `local://`, in which case only descriptors linked +// into the code will be searched; the filename can be on any filesystem +// accessible to TensorFlow. +// +// You can build a `descriptor_source` file using the `--descriptor_set_out` +// and `--include_imports` options to the protocol compiler `protoc`. +// +// The `local://` database only covers descriptors linked into the +// code via C++ libraries, not Python imports. You can link in a proto descriptor +// by creating a cc_library target with alwayslink=1. +// +// Both binary and text proto serializations are supported, and can be +// chosen using the `format` attribute. // // Arguments: -// var_: Should be from a Variable(). -// ms: Should be from a Variable(). -// mom: Should be from a Variable(). -// lr: Scaling factor. Must be a scalar. -// rho: Decay rate. Must be a scalar. +// bytes: Tensor of serialized protos with shape `batch_shape`. +// message_type: Name of the proto message type to decode. +// field_names: List of strings containing proto field names. An extension field can be decoded +// by using its full name, e.g. EXT_PACKAGE.EXT_FIELD_NAME. +// output_types: List of TF types to use for the respective field in field_names. // -// epsilon: Ridge term. Must be a scalar. -// grad: The gradient. -// -// Returns the created operation. -func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) { +// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`. +// Each entry is the number of values found for the corresponding field. +// Optional fields may have 0 or 1 values.List of tensors containing values for the corresponding field. +// `values[i]` has datatype `output_types[i]` +// and shape `[batch_shape, max(sizes[...,i])]`. +func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{} + attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types} for _, a := range optional { a(attrs) } opspec := tf.OpSpec{ - Type: "ResourceApplyRMSProp", + Type: "DecodeProtoV2", Input: []tf.Input{ - var_, ms, mom, lr, rho, momentum, epsilon, grad, + bytes, }, Attrs: attrs, } - return scope.AddOperation(opspec) -} - -// Computes square of x element-wise. -// -// I.e., \\(y = x * x = x^2\\). -func Square(scope *Scope, x tf.Output) (y tf.Output) { + op := scope.AddOperation(opspec) if scope.Err() != nil { return } + var idx int + var err error + sizes = op.Output(idx) + if values, idx, err = makeOutputList(op, idx, "values"); err != nil { + scope.UpdateErr("DecodeProtoV2", err) + return + } + return sizes, values +} + +// Creates a dataset that emits `components` as a tuple of tensors once. 
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_shapes": output_shapes} opspec := tf.OpSpec{ - Type: "Square", + Type: "TensorDataset", Input: []tf.Input{ - x, + tf.OutputList(components), }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a dataset that emits each dim-0 slice of `components` once. +func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "TensorSliceDataset", + Input: []tf.Input{ + tf.OutputList(components), + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Creates a dataset that contains `count` elements from the `input_dataset`. +// +// Arguments: +// +// count: A scalar representing the number of elements from the `input_dataset` +// that should be taken. A value of `-1` indicates that all of `input_dataset` +// is taken. +// +// +func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "TakeDataset", + Input: []tf.Input{ + input_dataset, count, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative. +type DepthwiseConv2dNativeAttr func(optionalAttr) + +// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value. +// +// value: Specify the data format of the input and output data. With the +// default format "NHWC", the data is stored in the order of: +// [batch, height, width, channels]. +// Alternatively, the format could be "NCHW", the data storage order of: +// [batch, channels, height, width]. +// If not specified, defaults to "NHWC" +func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { + return func(m optionalAttr) { + m["data_format"] = value + } +} + +// DepthwiseConv2dNativeDilations sets the optional dilations attribute to value. +// +// value: 1-D tensor of length 4. The dilation factor for each dimension of +// `input`. If set to k > 1, there will be k-1 skipped cells between each filter +// element on that dimension. The dimension order is determined by the value of +// `data_format`, see above for details. Dilations in the batch and depth +// dimensions must be 1. +// If not specified, defaults to <i:1 i:1 i:1 i:1 > +func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { + return func(m optionalAttr) { + m["dilations"] = value + } +} + +// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors. +// +// Given an input tensor of shape `[batch, in_height, in_width, in_channels]` +// and a filter / kernel tensor of shape +// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing +// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies +// a different filter to each input channel (expanding from 1 channel to +// `channel_multiplier` channels for each), then concatenates the results +// together. 
Thus, the output has `in_channels * channel_multiplier` channels. +// +// ``` +// for k in 0..in_channels-1 +// for q in 0..channel_multiplier-1 +// output[b, i, j, k * channel_multiplier + q] = +// sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] * +// filter[di, dj, k, q] +// ``` +// +// Must have `strides[0] = strides[3] = 1`. For the most common case of the same +// horizontal and vertices strides, `strides = [1, stride, stride, 1]`. +// +// Arguments: +// +// +// strides: 1-D of length 4. The stride of the sliding window for each dimension +// of `input`. +// padding: The type of padding algorithm to use. +func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"strides": strides, "padding": padding} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DepthwiseConv2dNative", + Input: []tf.Input{ + input, filter, + }, + Attrs: attrs, } op := scope.AddOperation(opspec) return op.Output(0) diff --git a/tensorflow/java/src/main/java/org/tensorflow/EagerOperation.java b/tensorflow/java/src/main/java/org/tensorflow/EagerOperation.java index 2c1df4cdc40..9c8c59ec4e7 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/EagerOperation.java +++ b/tensorflow/java/src/main/java/org/tensorflow/EagerOperation.java @@ -19,16 +19,21 @@ import java.util.concurrent.atomic.AtomicReferenceArray; /** * Implementation of an {@link Operation} executed eagerly. - * - * <p>EagerOperation instances are valid only as long as the {@link EagerSession} they are a part of is - * valid. Thus, if {@link EagerSession#close()} has been invoked, then methods on the EagerOperation - * instance may fail with an {@code IllegalStateException}. + * + * <p>EagerOperation instances are valid only as long as the {@link EagerSession} they are a part of + * is valid. Thus, if {@link EagerSession#close()} has been invoked, then methods on the + * EagerOperation instance may fail with an {@code IllegalStateException}. * * <p>EagerOperation instances are thread-safe. 
*/ class EagerOperation extends AbstractOperation { - - EagerOperation(EagerSession session, long opNativeHandle, long[] outputNativeHandles, String type, String name) { + + EagerOperation( + EagerSession session, + long opNativeHandle, + long[] outputNativeHandles, + String type, + String name) { this.session = session; this.type = type; this.name = name; @@ -123,7 +128,8 @@ class EagerOperation extends AbstractOperation { private static class NativeReference extends EagerSession.NativeReference { - NativeReference(EagerSession session, EagerOperation operation, long opHandle, long[] outputHandles) { + NativeReference( + EagerSession session, EagerOperation operation, long opHandle, long[] outputHandles) { super(session, operation); this.opHandle = opHandle; this.outputHandles = outputHandles; @@ -142,7 +148,7 @@ class EagerOperation extends AbstractOperation { opHandle = 0L; } } - + private long opHandle; private final long[] outputHandles; } @@ -152,7 +158,7 @@ class EagerOperation extends AbstractOperation { private static native void deleteTensorHandle(long handle); private static native long resolveTensorHandle(long handle); - + private static native int outputListLength(long handle, String name); private static native int inputListLength(long handle, String name); diff --git a/tensorflow/java/src/main/java/org/tensorflow/EagerOperationBuilder.java b/tensorflow/java/src/main/java/org/tensorflow/EagerOperationBuilder.java index 2097f4ad4fa..7e5a9a778a4 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/EagerOperationBuilder.java +++ b/tensorflow/java/src/main/java/org/tensorflow/EagerOperationBuilder.java @@ -33,8 +33,9 @@ final class EagerOperationBuilder implements OperationBuilder { @Override public EagerOperation build() { long[] tensorHandles = execute(nativeRef.opHandle); - EagerOperation operation = new EagerOperation(session, nativeRef.opHandle, tensorHandles, type, name); - // Release our reference to the native op handle now that we transferred its + EagerOperation operation = + new EagerOperation(session, nativeRef.opHandle, tensorHandles, type, name); + // Release our reference to the native op handle now that we transferred its // ownership to the EagerOperation nativeRef.clear(); return operation; diff --git a/tensorflow/java/src/main/native/eager_operation_jni.cc b/tensorflow/java/src/main/native/eager_operation_jni.cc index 15f98905796..2dbe81efd35 100644 --- a/tensorflow/java/src/main/native/eager_operation_jni.cc +++ b/tensorflow/java/src/main/native/eager_operation_jni.cc @@ -13,14 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/java/src/main/native/eager_operation_jni.h" + #include <assert.h> #include <stdlib.h> #include <string.h> + #include <algorithm> #include <memory> #include "tensorflow/c/eager/c_api.h" -#include "tensorflow/java/src/main/native/eager_operation_jni.h" #include "tensorflow/java/src/main/native/exception_jni.h" namespace { @@ -36,8 +38,7 @@ TFE_Op* requireOp(JNIEnv* env, jlong handle) { TFE_TensorHandle* requireTensorHandle(JNIEnv* env, jlong handle) { if (handle == 0) { - throwException(env, kIllegalStateException, - "EagerSession has been closed"); + throwException(env, kIllegalStateException, "EagerSession has been closed"); return nullptr; } return reinterpret_cast<TFE_TensorHandle*>(handle); @@ -45,8 +46,9 @@ TFE_TensorHandle* requireTensorHandle(JNIEnv* env, jlong handle) { } // namespace -JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperation_delete( - JNIEnv* env, jclass clazz, jlong handle) { +JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperation_delete(JNIEnv* env, + jclass clazz, + jlong handle) { if (handle == 0) return; TFE_DeleteOp(reinterpret_cast<TFE_Op*>(handle)); } @@ -127,8 +129,10 @@ JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_numDims( return static_cast<jint>(num_dims); } -JNIEXPORT jlong JNICALL Java_org_tensorflow_EagerOperation_dim( - JNIEnv* env, jclass clazz, jlong handle, jint dim_index) { +JNIEXPORT jlong JNICALL Java_org_tensorflow_EagerOperation_dim(JNIEnv* env, + jclass clazz, + jlong handle, + jint dim_index) { TFE_TensorHandle* tensor_handle = requireTensorHandle(env, handle); if (tensor_handle == nullptr) return 0; TF_Status* status = TF_NewStatus(); diff --git a/tensorflow/java/src/main/native/eager_operation_jni.h b/tensorflow/java/src/main/native/eager_operation_jni.h index c1d52bf9393..a6924f5aa3d 100644 --- a/tensorflow/java/src/main/native/eager_operation_jni.h +++ b/tensorflow/java/src/main/native/eager_operation_jni.h @@ -27,16 +27,16 @@ extern "C" { * Method: delete * Signature: (J)V */ -JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperation_delete( - JNIEnv *, jclass, jlong); +JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperation_delete(JNIEnv *, + jclass, jlong); /* * Class: org_tensorflow_EagerOperation * Method: deleteTensorHandle * Signature: (J)V */ -JNIEXPORT void JNICALL Java_org_tensorflow_EagerOperation_deleteTensorHandle( - JNIEnv *, jclass, jlong); +JNIEXPORT void JNICALL +Java_org_tensorflow_EagerOperation_deleteTensorHandle(JNIEnv *, jclass, jlong); /** * Class: org_tensorflow_EagerOperation @@ -67,24 +67,26 @@ JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_inputListLength( * Method: dataType * Signature: (J)I */ -JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_dataType( - JNIEnv *, jclass, jlong); +JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_dataType(JNIEnv *, + jclass, + jlong); /** * Class: org_tensorflow_EagerOperation * Method: numDims * Signature: (J)I */ -JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_numDims( - JNIEnv *, jclass, jlong); +JNIEXPORT jint JNICALL Java_org_tensorflow_EagerOperation_numDims(JNIEnv *, + jclass, + jlong); /** * Class: org_tensorflow_EagerOperation * Method: dim * Signature: (JI)J */ -JNIEXPORT jlong JNICALL Java_org_tensorflow_EagerOperation_dim( - JNIEnv *, jclass, jlong, jint); +JNIEXPORT jlong JNICALL Java_org_tensorflow_EagerOperation_dim(JNIEnv *, jclass, + jlong, jint); #ifdef __cplusplus } // extern "C" diff --git 
a/tensorflow/java/src/test/java/org/tensorflow/EagerOperationBuilderTest.java b/tensorflow/java/src/test/java/org/tensorflow/EagerOperationBuilderTest.java index 83b683dde6e..0f00a26dba4 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/EagerOperationBuilderTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/EagerOperationBuilderTest.java @@ -21,9 +21,7 @@ import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; -/** - * Unit tests for {@link EagerOperationBuilder} class. - */ +/** Unit tests for {@link EagerOperationBuilder} class. */ @RunWith(JUnit4.class) public class EagerOperationBuilderTest { @@ -56,10 +54,11 @@ public class EagerOperationBuilderTest { @Test public void addInputs() { try (EagerSession session = EagerSession.create()) { - Operation asrt = opBuilder(session, "Assert", "assert") - .addInput(TestUtil.constant(session, "Cond", true)) - .addInputList(new Output<?>[] {TestUtil.constant(session, "Error", -1)}) - .build(); + Operation asrt = + opBuilder(session, "Assert", "assert") + .addInput(TestUtil.constant(session, "Cond", true)) + .addInputList(new Output<?>[] {TestUtil.constant(session, "Error", -1)}) + .build(); try { opBuilder(session, "Const", "var").addControlInput(asrt); fail(); @@ -68,7 +67,7 @@ public class EagerOperationBuilderTest { } } } - + @Test public void setDevice() { try (EagerSession session = EagerSession.create()) { @@ -134,12 +133,12 @@ public class EagerOperationBuilderTest { opBuilder(session, "ApproximateEqual", "Float") .addInput(TestUtil.constant(session, "Const1", 10.00001f)) .addInput(TestUtil.constant(session, "Const2", 10.00000f)) - .setAttr("tolerance", 0.1f) + .setAttr("tolerance", 0.1f) .build(); // Missing tests: list(string), list(byte), list(bool), list(type) } } - + private static EagerOperationBuilder opBuilder(EagerSession session, String type, String name) { return new EagerOperationBuilder(session, type, name); } diff --git a/tensorflow/java/src/test/java/org/tensorflow/EagerOperationTest.java b/tensorflow/java/src/test/java/org/tensorflow/EagerOperationTest.java index 4b7fdc8ccf8..41b0ed3936d 100644 --- a/tensorflow/java/src/test/java/org/tensorflow/EagerOperationTest.java +++ b/tensorflow/java/src/test/java/org/tensorflow/EagerOperationTest.java @@ -22,12 +22,10 @@ import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; -/** - * Unit tests for {@link EagerOperation} class. - */ +/** Unit tests for {@link EagerOperation} class. 
*/ @RunWith(JUnit4.class) public class EagerOperationTest { - + @Test public void failToCreateIfSessionIsClosed() { EagerSession session = EagerSession.create(); @@ -39,21 +37,22 @@ public class EagerOperationTest { // expected } } - + @Test public void outputDataTypeAndShape() { try (EagerSession session = EagerSession.create(); Tensor<Integer> t = Tensors.create(new int[2][3])) { - EagerOperation op = opBuilder(session, "Const", "OutputAttrs") - .setAttr("dtype", DataType.INT32) - .setAttr("value", t) - .build(); + EagerOperation op = + opBuilder(session, "Const", "OutputAttrs") + .setAttr("dtype", DataType.INT32) + .setAttr("value", t) + .build(); assertEquals(DataType.INT32, op.dtype(0)); assertEquals(2, op.shape(0)[0]); assertEquals(3, op.shape(0)[1]); } } - + @Test public void outputTensor() { try (EagerSession session = EagerSession.create()) { @@ -76,17 +75,19 @@ public class EagerOperationTest { Output<Float> c1 = TestUtil.constant(session, "Const1", new float[] {1f, 2f}); Output<Float> c2 = TestUtil.constant(session, "Const2", new float[] {3f, 4f}); - EagerOperation acc = opBuilder(session, "AddN", "InputListLength") - .addInputList(new Output<?>[] {c1, c2}) - .build(); + EagerOperation acc = + opBuilder(session, "AddN", "InputListLength") + .addInputList(new Output<?>[] {c1, c2}) + .build(); assertEquals(2, acc.inputListLength("inputs")); assertEquals(1, acc.outputListLength("sum")); - EagerOperation split = opBuilder(session, "Split", "OutputListLength") - .addInput(TestUtil.constant(session, "Axis", 0)) - .addInput(c1) - .setAttr("num_split", 2) - .build(); + EagerOperation split = + opBuilder(session, "Split", "OutputListLength") + .addInput(TestUtil.constant(session, "Axis", 0)) + .addInput(c1) + .setAttr("num_split", 2) + .build(); assertEquals(1, split.inputListLength("split_dim")); assertEquals(2, split.outputListLength("output")); @@ -105,19 +106,20 @@ public class EagerOperationTest { } } } - + @Test public void numOutputs() { try (EagerSession session = EagerSession.create()) { - EagerOperation op = opBuilder(session, "UniqueWithCountsV2", "unq") - .addInput(TestUtil.constant(session, "Const1", new int[] {1, 2, 1})) - .addInput(TestUtil.constant(session, "Axis", new int[] {0})) - .setAttr("out_idx", DataType.INT32) - .build(); + EagerOperation op = + opBuilder(session, "UniqueWithCountsV2", "unq") + .addInput(TestUtil.constant(session, "Const1", new int[] {1, 2, 1})) + .addInput(TestUtil.constant(session, "Axis", new int[] {0})) + .setAttr("out_idx", DataType.INT32) + .build(); assertEquals(3, op.numOutputs()); - } + } } - + @Test public void opNotAccessibleIfSessionIsClosed() { EagerSession session = EagerSession.create(); diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 7b34fa120f0..f43b8fd4c17 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -196,7 +196,6 @@ cc_library( "//tensorflow/lite/core/api", "//tensorflow/lite/delegates/nnapi:nnapi_delegate", "//tensorflow/lite/nnapi:nnapi_implementation", - "//tensorflow/lite/profiling:profiler", "//tensorflow/lite/schema:schema_fbs", ] + select({ ":with_select_tf_ops": [ @@ -252,6 +251,7 @@ cc_test( "//tensorflow/lite/kernels/internal:tensor_utils", "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/testing:util", + "//third_party/eigen3", "@com_google_googletest//:gtest", ], ) diff --git a/tensorflow/lite/c/c_api_internal.c b/tensorflow/lite/c/c_api_internal.c index f20ee23bd81..926d992011f 100644 --- a/tensorflow/lite/c/c_api_internal.c +++ b/tensorflow/lite/c/c_api_internal.c @@ 
-172,6 +172,8 @@ const char* TfLiteTypeGetName(TfLiteType type) { return "COMPLEX64"; case kTfLiteString: return "STRING"; + case kTfLiteFloat16: + return "FLOAT16"; } return "Unknown type"; } diff --git a/tensorflow/lite/c/c_api_internal.h b/tensorflow/lite/c/c_api_internal.h index d9f08be0faa..1948e1ba106 100644 --- a/tensorflow/lite/c/c_api_internal.h +++ b/tensorflow/lite/c/c_api_internal.h @@ -195,6 +195,11 @@ typedef struct { float re, im; // real and imaginary parts, respectively. } TfLiteComplex64; +// Half precision data type compatible with the C99 definition. +typedef struct { + uint16_t data; +} TfLiteFloat16; + // Types supported by tensor typedef enum { kTfLiteNoType = 0, @@ -207,6 +212,7 @@ typedef enum { kTfLiteInt16 = 7, kTfLiteComplex64 = 8, kTfLiteInt8 = 9, + kTfLiteFloat16 = 10, } TfLiteType; // Return the name of a given type, for error reporting purposes. @@ -259,6 +265,8 @@ typedef union { int32_t* i32; int64_t* i64; float* f; + // Placeholder for 16b float type. Use uint16* in the pointer union for now. + TfLiteFloat16* f16; char* raw; const char* raw_const; uint8_t* uint8; diff --git a/tensorflow/lite/c/c_api_internal_test.cc b/tensorflow/lite/c/c_api_internal_test.cc index d01cf63a3e0..9a37cd9552f 100644 --- a/tensorflow/lite/c/c_api_internal_test.cc +++ b/tensorflow/lite/c/c_api_internal_test.cc @@ -78,6 +78,7 @@ TEST(Types, TestTypeNames) { }; EXPECT_EQ(type_name(kTfLiteNoType), "NOTYPE"); EXPECT_EQ(type_name(kTfLiteFloat32), "FLOAT32"); + EXPECT_EQ(type_name(kTfLiteFloat16), "FLOAT16"); EXPECT_EQ(type_name(kTfLiteInt16), "INT16"); EXPECT_EQ(type_name(kTfLiteInt32), "INT32"); EXPECT_EQ(type_name(kTfLiteUInt8), "UINT8"); diff --git a/tensorflow/lite/core/api/BUILD b/tensorflow/lite/core/api/BUILD index db6b4a2d18e..17eeed6a687 100644 --- a/tensorflow/lite/core/api/BUILD +++ b/tensorflow/lite/core/api/BUILD @@ -17,6 +17,7 @@ cc_library( "error_reporter.h", "flatbuffer_conversions.h", "op_resolver.h", + "profiler.h", ], copts = tflite_copts(), deps = [ diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc index 2354f000a71..9d496f676f3 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.cc +++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc @@ -61,9 +61,8 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type, *type = kTfLiteFloat32; break; case TensorType_FLOAT16: - error_reporter->Report("Unimplemented data type float16 in tensor\n", - tensor_type); - return kTfLiteError; + *type = kTfLiteFloat16; + break; case TensorType_INT16: *type = kTfLiteInt16; break; diff --git a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc index 4a5de48302c..c7f8c1ad66e 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc +++ b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc @@ -141,6 +141,13 @@ TEST_F(FlatbufferConversionsTest, TestConvertTensorType) { EXPECT_EQ(kTfLiteFloat32, type); } +TEST_F(FlatbufferConversionsTest, TestConvertTensorTypeFloat16) { + TfLiteType type; + EXPECT_EQ(kTfLiteOk, + ConvertTensorType(TensorType_FLOAT16, &type, &mock_reporter_)); + EXPECT_EQ(kTfLiteFloat16, type); +} + } // namespace tflite int main(int argc, char** argv) { diff --git a/tensorflow/lite/core/api/profiler.h b/tensorflow/lite/core/api/profiler.h new file mode 100644 index 00000000000..f36f8e13c3c --- /dev/null +++ b/tensorflow/lite/core/api/profiler.h @@ -0,0 +1,85 @@ +/* Copyright 2017 The TensorFlow 
Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_CORE_API_PROFILER_H_ +#define TENSORFLOW_LITE_CORE_API_PROFILER_H_ + +#include <cstdint> + +namespace tflite { + +// A simple utility for enabling profiled event tracing in TensorFlow Lite. +class Profiler { + public: + enum class EventType { + // Default event type, the metadata field has no special significance. + DEFAULT = 0, + // The event is an operator invocation and the event_metadata field is the + // index of operator node. + OPERATOR_INVOKE_EVENT = 1 + }; + + virtual ~Profiler() {} + + // Signals the beginning of an event, returning a handle to the profile event. + virtual uint32_t BeginEvent(const char* tag, EventType event_type, + uint32_t event_metadata) = 0; + + // Signals an end to the specified profile event. + virtual void EndEvent(uint32_t event_handle) = 0; +}; + +// Adds a profile event to `profiler` that begins with the construction +// of the object and ends when the object goes out of scope. +// The lifetime of tag should be at least the lifetime of `profiler`. +// `profiler` may be null, in which case nothing is profiled. +class ScopedProfile { + public: + ScopedProfile(Profiler* profiler, const char* tag, + Profiler::EventType event_type = Profiler::EventType::DEFAULT, + uint32_t event_metadata = 0) + : profiler_(profiler), event_handle_(0) { + if (profiler) { + event_handle_ = profiler_->BeginEvent(tag, event_type, event_metadata); + } + } + + ~ScopedProfile() { + if (profiler_) { + profiler_->EndEvent(event_handle_); + } + } + + private: + Profiler* const profiler_; + uint32_t event_handle_; +}; + +class ScopedOperatorProfile : public ScopedProfile { + public: + ScopedOperatorProfile(Profiler* profiler, const char* tag, int node_index) + : ScopedProfile(profiler, tag, Profiler::EventType::OPERATOR_INVOKE_EVENT, + static_cast<uint32_t>(node_index)) {} +}; + +} // namespace tflite + +#define TFLITE_VARNAME_UNIQ(name, ctr) name##ctr +#define TFLITE_SCOPED_TAGGED_OPERATOR_PROFILE(profiler, tag, node_index) \ + tflite::ScopedOperatorProfile TFLITE_VARNAME_UNIQ(_profile_, __COUNTER__)( \ + (profiler), (tag), (node_index)) +#define TFLITE_SCOPED_OPERATOR_PROFILE(profiler, node_index) \ + TFLITE_SCOPED_TAGGED_OPERATOR_PROFILE((profiler), "OpInvoke", (node_index)) + +#endif // TENSORFLOW_LITE_CORE_API_PROFILER_H_ diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index 9bae0329268..afa2d63f64f 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -469,6 +469,9 @@ TfLiteStatus Subgraph::BytesRequired(TfLiteType type, const int* dims, case kTfLiteInt8: *bytes = sizeof(int8_t) * count; break; + case kTfLiteFloat16: + *bytes = sizeof(TfLiteFloat16) * count; + break; default: ReportError( "Only float32, int8, int16, int32, int64, uint8, bool, complex64 " @@ -700,7 +703,7 @@ TfLiteStatus Subgraph::Invoke() { TfLiteNode& node = 
nodes_and_registration_[node_index].first; const TfLiteRegistration& registration = nodes_and_registration_[node_index].second; - SCOPED_OPERATOR_PROFILE(profiler_, node_index); + TFLITE_SCOPED_OPERATOR_PROFILE(profiler_, node_index); // TODO(ycling): This is an extra loop through inputs to check if the data // need to be copied from Delegate buffer to raw memory, which is often not diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index d25b640ee4c..b20cd06d686 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -20,9 +20,9 @@ limitations under the License. #include "tensorflow/lite/allocation.h" #include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/core/api/profiler.h" #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" #include "tensorflow/lite/memory_planner.h" -#include "tensorflow/lite/profiling/profiler.h" #include "tensorflow/lite/util.h" namespace tflite { @@ -276,12 +276,12 @@ class Subgraph { // WARNING: This is an experimental API and subject to change. TfLiteStatus ResetVariableTensors(); - void SetProfiler(profiling::Profiler* profiler) { + void SetProfiler(Profiler* profiler) { profiler_ = profiler; context_->profiler = profiler; } - profiling::Profiler* GetProfiler() { return profiler_; } + Profiler* GetProfiler() { return profiler_; } // Returns a pointer to vector of subgraphs. // WARNING: This is an experimental API and subject to change. @@ -527,7 +527,7 @@ class Subgraph { TfLiteExternalContext** external_contexts_; // Profiler for this interpreter instance. - profiling::Profiler* profiler_ = nullptr; + Profiler* profiler_ = nullptr; // A pointer to vector of subgraphs. The vector is owned by the interpreter. std::vector<std::unique_ptr<Subgraph>>* subgraphs_ = nullptr; diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index c2d41f6f417..43c3d5f6eb0 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -154,11 +154,11 @@ cc_library( ":delegate_data", ":util", "@flatbuffers", + "//tensorflow/lite/core/api", "//tensorflow/lite/c:c_api_internal", "//tensorflow/lite:kernel_api", "//tensorflow/lite:string", "//tensorflow/lite/kernels:kernel_util", - "//tensorflow/lite/profiling:profiler", ] + select({ # TODO(b/111881878): The android_tensorflow_lib target pulls in the full # set of core TensorFlow kernels. We may want to revisit this dependency diff --git a/tensorflow/lite/delegates/flex/buffer_map.cc b/tensorflow/lite/delegates/flex/buffer_map.cc index 0d0c9536366..1f6df9ada73 100644 --- a/tensorflow/lite/delegates/flex/buffer_map.cc +++ b/tensorflow/lite/delegates/flex/buffer_map.cc @@ -15,11 +15,12 @@ limitations under the License. 
#include "tensorflow/lite/delegates/flex/buffer_map.h" #include "tensorflow/c/c_api_internal.h" +#include "tensorflow/core/framework/allocation_description.pb.h" +#include "tensorflow/core/framework/log_memory.h" +#include "tensorflow/core/framework/typed_allocator.h" #include "tensorflow/lite/delegates/flex/util.h" #include "tensorflow/lite/string.h" #include "tensorflow/lite/string_util.h" -#include "tensorflow/core/framework/allocation_description.pb.h" -#include "tensorflow/core/framework/log_memory.h" namespace tflite { namespace flex { @@ -99,8 +100,9 @@ class StringTfLiteTensorBuffer : public BaseTfLiteTensorBuffer { ~StringTfLiteTensorBuffer() override { LogDeallocation(); - tensorflow::cpu_allocator()->Deallocate<string>( - static_cast<string*>(data()), num_strings_); + tensorflow::TypedAllocator::Deallocate<tensorflow::string>( + tensorflow::cpu_allocator(), static_cast<tensorflow::string*>(data()), + num_strings_); } size_t size() const override { return num_strings_ * sizeof(string); } @@ -109,7 +111,9 @@ class StringTfLiteTensorBuffer : public BaseTfLiteTensorBuffer { StringTfLiteTensorBuffer(const TfLiteTensor* tensor, int num_strings) : BaseTfLiteTensorBuffer( num_strings != 0 - ? tensorflow::cpu_allocator()->Allocate<string>(num_strings) + ? tensorflow::TypedAllocator::Allocate<tensorflow::string>( + tensorflow::cpu_allocator(), num_strings, + tensorflow::AllocationAttributes()) : nullptr), num_strings_(num_strings) { LogAllocation(); diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc index 6443e9fe1bf..4f3d0f1dde6 100644 --- a/tensorflow/lite/delegates/flex/kernel.cc +++ b/tensorflow/lite/delegates/flex/kernel.cc @@ -24,10 +24,10 @@ limitations under the License. #include "tensorflow/lite/builtin_ops.h" #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/context_util.h" +#include "tensorflow/lite/core/api/profiler.h" #include "tensorflow/lite/delegates/flex/delegate_data.h" #include "tensorflow/lite/delegates/flex/util.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/profiling/profiler.h" #include "tensorflow/lite/string.h" // Note: this is part of TF Lite's Flex delegation code which is to be @@ -529,8 +529,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // Execute the TensorFlow Ops sequentially. 
for (auto& node_data : op_data->nodes) { - SCOPED_TAGGED_OPERATOR_PROFILE( - reinterpret_cast<profiling::Profiler*>(context->profiler), + TFLITE_SCOPED_TAGGED_OPERATOR_PROFILE( + reinterpret_cast<Profiler*>(context->profiler), node_data->name().c_str(), node_data->index()); auto status = ExecuteFlexOp(context, buffer_map, node_data.get()); diff --git a/tensorflow/lite/delegates/flex/util.cc b/tensorflow/lite/delegates/flex/util.cc index c995b360f9d..4279f4ae397 100644 --- a/tensorflow/lite/delegates/flex/util.cc +++ b/tensorflow/lite/delegates/flex/util.cc @@ -60,6 +60,8 @@ TF_DataType GetTensorFlowDataType(TfLiteType type) { return TF_FLOAT; case kTfLiteFloat32: return TF_FLOAT; + case kTfLiteFloat16: + return TF_HALF; case kTfLiteInt16: return TF_INT16; case kTfLiteInt32: @@ -83,6 +85,8 @@ TfLiteType GetTensorFlowLiteType(TF_DataType type) { switch (type) { case TF_FLOAT: return kTfLiteFloat32; + case TF_HALF: + return kTfLiteFloat16; case TF_INT16: return kTfLiteInt16; case TF_INT32: diff --git a/tensorflow/lite/delegates/flex/util_test.cc b/tensorflow/lite/delegates/flex/util_test.cc index 87104751b81..69bba405055 100644 --- a/tensorflow/lite/delegates/flex/util_test.cc +++ b/tensorflow/lite/delegates/flex/util_test.cc @@ -101,9 +101,9 @@ TEST(UtilTest, CopyShapeAndType) { EXPECT_EQ( CopyShapeAndType(&context, Tensor(tensorflow::DT_HALF, {1, 2}), &dst), - kTfLiteError); - EXPECT_EQ(context.error, - "TF Lite does not support TensorFlow data type: half"); + kTfLiteOk); + EXPECT_THAT(context.new_size, ElementsAre(1, 2)); + EXPECT_EQ(dst.type, kTfLiteFloat16); } TEST(UtilTest, TypeConversionsFromTFLite) { diff --git a/tensorflow/lite/delegates/gpu/common/memory_management.cc b/tensorflow/lite/delegates/gpu/common/memory_management.cc index 09b40aa812c..73a27a3c4ea 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management.cc +++ b/tensorflow/lite/delegates/gpu/common/memory_management.cc @@ -32,7 +32,7 @@ struct PoolRecord { PoolRecord(uint32_t size, size_t obj_id) : object_size(size), object_id(obj_id) {} - // objects in pool are ordered by size + // Objects in pool are ordered by size. bool operator<(const PoolRecord& other) const { return (object_size < other.object_size) || (object_size == other.object_size && object_id < other.object_id); diff --git a/tensorflow/lite/delegates/gpu/common/memory_management_test.cc b/tensorflow/lite/delegates/gpu/common/memory_management_test.cc index 93897ea2e39..6fd8b217b67 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management_test.cc +++ b/tensorflow/lite/delegates/gpu/common/memory_management_test.cc @@ -22,8 +22,6 @@ namespace tflite { namespace gpu { namespace { -// using ::testing::Eq; // Optional ::testing aliases. Remove if unused. 
-// using ::testing::Test; using ::testing::ElementsAre; TEST(Model, EmptyRecords) { diff --git a/tensorflow/lite/delegates/gpu/common/shape.cc b/tensorflow/lite/delegates/gpu/common/shape.cc index 3ffc651765e..df34076313c 100644 --- a/tensorflow/lite/delegates/gpu/common/shape.cc +++ b/tensorflow/lite/delegates/gpu/common/shape.cc @@ -111,5 +111,15 @@ std::string ToString(const Shape& s) { absl::StrJoin(s.dimensions, ", "), "}}"); } +template <> +int64_t StrongShape<Layout::OHWI>::LinearIndex( + const std::array<int32_t, 4>& coordinates) const { + int64_t index = coordinates[0]; + index = index * StrongShape::get(1) + coordinates[1]; + index = index * StrongShape::get(2) + coordinates[2]; + index = index * StrongShape::get(3) + coordinates[3]; + return index; +} + } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/lstm.cc b/tensorflow/lite/delegates/gpu/gl/kernels/lstm.cc index 2f95947db21..696d5257598 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/lstm.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/lstm.cc @@ -45,9 +45,6 @@ class LstmNodeShader : public NodeShader { public: Status GenerateCode(const GenerationContext& ctx, GeneratedCode* generated_code) const final { - auto inputs = ctx.graph->FindInputs(ctx.node->id); - auto outputs = ctx.graph->FindOutputs(ctx.node->id); - std::string code = R"( vec4 prev_state = $input_data_1[gid.x, gid.y, gid.z]$; @@ -78,7 +75,7 @@ class LstmNodeShader : public NodeShader { /*objects=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), - /*source_code=*/code, + /*source_code=*/std::move(code), /*input=*/IOStructure::ONLY_DEFINITIONS, /*output=*/IOStructure::AUTO, }; diff --git a/tensorflow/lite/delegates/gpu/java/src/main/native/BUILD b/tensorflow/lite/delegates/gpu/java/src/main/native/BUILD index 41387fdacda..7f8162275f3 100644 --- a/tensorflow/lite/delegates/gpu/java/src/main/native/BUILD +++ b/tensorflow/lite/delegates/gpu/java/src/main/native/BUILD @@ -11,7 +11,6 @@ licenses(["notice"]) # Apache 2.0 cc_library( name = "native", srcs = ["gpu_delegate_jni.cc"], - hdrs = ["gpu_delegate_jni.h"], copts = tflite_copts(), linkopts = select({ "//tensorflow:android": [ diff --git a/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc b/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc index e0308c29df4..51e3ce130a8 100644 --- a/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc +++ b/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc @@ -13,10 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.h" +#include <jni.h> #include "tensorflow/lite/delegates/gpu/gl_delegate.h" +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_gpu_GpuDelegate_createDelegate( JNIEnv* env, jclass clazz, jboolean precision_loss_allowed, jboolean dynamic_batch_enabled, jint preferred_gl_object_type) { @@ -45,3 +49,7 @@ Java_org_tensorflow_lite_gpu_GpuDelegate_bindGlBufferToTensor( ? 
JNI_TRUE : JNI_FALSE; } + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.h b/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.h deleted file mode 100644 index b36fd912b23..00000000000 --- a/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_DELEGATES_GPU_JAVA_SRC_MAIN_NATIVE_GPU_DELEGATE_JNI_H_ -#define TENSORFLOW_LITE_DELEGATES_GPU_JAVA_SRC_MAIN_NATIVE_GPU_DELEGATE_JNI_H_ - -#include <jni.h> - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/* - * Class: org_tensorflow_lite_gpu_GpuDelegate - * Method: createDelegate - * Signature: (ZZI)J - */ -JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_gpu_GpuDelegate_createDelegate( - JNIEnv* env, jclass clazz, jboolean precision_loss_allowed, - jboolean dynamic_batch_enabled, jint preferred_gl_object_type); - -/* - * Class: org_tensorflow_lite_gpu_GpuDelegate - * Method: deleteDelegate - * Signature: (J) - */ -JNIEXPORT void JNICALL Java_org_tensorflow_lite_gpu_GpuDelegate_deleteDelegate( - JNIEnv* env, jclass clazz, jlong delegate); - -/* - * Class: org_tensorflow_lite_gpu_GpuDelegate - * Method: bindGlBufferToTensor - * Signature: (JII)Z - */ -JNIEXPORT jboolean JNICALL -Java_org_tensorflow_lite_gpu_GpuDelegate_bindGlBufferToTensor( - JNIEnv* env, jclass clazz, jlong delegate, jint tensor_index, jint ssbo); - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - -#endif // TENSORFLOW_LITE_DELEGATES_FLOW_GPU_JAVA_SRC_MAIN_NATIVE_GPU_DELEGATE_JNI_H_ diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc b/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc index f4a67dcc246..19be9d4902a 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc +++ b/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc @@ -701,7 +701,7 @@ kernel void ComputeFunction( )"; for (int i = 0; i < z_out; ++i) { const std::string s_i = std::to_string(i); - code += " ACCUM_FLT4 r" + s_i + " = ACCUM_FLT4(0.0f, 0.0f, 0.0f, 0.0f);\n"; + code += " float4 r" + s_i + " = float4(0.0f, 0.0f, 0.0f, 0.0f);\n"; } code += R"( device FLT4* tmp = filters + gid_z * 4 * params.src_size.w; @@ -728,7 +728,7 @@ kernel void ComputeFunction( )"; for (int i = 0; i < z_out; ++i) { const std::string s_i = std::to_string(i); - code += " r" + s_i + " += TO_ACCUM4_TYPE(bias_loc[" + s_i + "]);\n"; + code += " r" + s_i + " += float4(bias_loc[" + s_i + "]);\n"; } for (int i = 0; i < z_out; ++i) { const std::string s_i = std::to_string(i); diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc b/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc index cacbb3c8ae3..15b46541562 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc +++ 
b/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc @@ -468,6 +468,7 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConvolution( int id, ValueId input_id, ValueId output_id, const DepthwiseConvolution2DAttributes& attr, const RuntimeOptions& options) { + int channels_multiplier = attr.weights.shape.o; auto desc = std::make_shared<ComputeTaskDescriptor>(); desc->id = id; desc->is_linkable = false; @@ -503,10 +504,44 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConvolution( const bool outside = coords.x < 0 || coords.y < 0 || coords.x >= params.size.x || coords.y >= params.size.y; if (outside) continue; - - const int src_layer = gid.z; - const int src_index = (src_layer * params.size.y + coords.y) * params.size.x + coords.x; - sum0 += float4(src_buffer[src_index]) * float4(temp[ky * kernel_x + kx]); +)"; + if (channels_multiplier == 1) { + shader_source += R"( + const int src_layer = gid.z; + const int src_index = (src_layer * params.size.y + coords.y) * params.size.x + coords.x; + const FLT4 src_modified = src_buffer[src_index]; +)"; + } else if (channels_multiplier == 2) { + shader_source += R"( + const int src_layer = gid.z / 2; + const int src_index = (src_layer * params.size.y + coords.y) * params.size.x + coords.x; + const FLT4 src = src_buffer[src_index]; + const FLT2 t0 = gid.z % 2 == 0 ? src.xy : src.zw; + const FLT4 src_modified = FLT4(t0.x, t0.x, t0.y, t0.y); +)"; + } else if (channels_multiplier == 4) { + shader_source += R"( + const int src_layer = gid.z / 4; + const int src_index = (src_layer * params.size.y + coords.y) * params.size.x + coords.x; + const FLT4 src = src_buffer[src_index]; + const FLT t0 = src[gid.z % 4]; + const FLT4 src_modified = FLT4(t0, t0, t0, t0); +)"; + } else { + shader_source += R"( + const int src_layer = gid.z / params.channel_multiplier.x; + const int src_index = (src_layer * params.size.y + coords.y) * params.size.x + coords.x; + const FLT4 src = src_buffer[src_index]; + FLT4 src_modified; + const int src_layer_offset = (gid.z % params.channel_multiplier.x) * 4; + src_modified.x = src[(src_layer_offset + 0) / params.channel_multiplier.x]; + src_modified.y = src[(src_layer_offset + 1) / params.channel_multiplier.x]; + src_modified.z = src[(src_layer_offset + 2) / params.channel_multiplier.x]; + src_modified.w = src[(src_layer_offset + 3) / params.channel_multiplier.x]; +)"; + } + shader_source += R"( + sum0 += float4(src_modified * temp[ky * kernel_x + kx]); } } FLT4 res = FLT4(sum0 + float4(biases[gid.z])); @@ -531,19 +566,7 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConvolution( return out_shape; }}; - const int num_output_channels = attr.weights.shape.i * attr.weights.shape.o; - BHWC reordered_dims{1, attr.weights.shape.h, attr.weights.shape.w, - num_output_channels}; - std::vector<float> filters_reordered(GetElementsSizeForPHWC4(reordered_dims), - 0.0f); - if (!ConvertToPHWC4( - absl::MakeConstSpan(attr.weights.data.data(), - attr.weights.data.size()), - reordered_dims, - absl::MakeSpan(filters_reordered.data(), filters_reordered.size())) - .ok()) { - return {}; - } + std::vector<float> filters_reordered = ConvertToPIOHW4(attr.weights); auto filters = options.storage_precision == RuntimeOptions::Precision::FP32 ? 
VectorToUint8Vector(filters_reordered) : VectorFloatToHalf(filters_reordered); diff --git a/tensorflow/lite/delegates/nnapi/java/src/main/native/BUILD b/tensorflow/lite/delegates/nnapi/java/src/main/native/BUILD index 405fc2e82a3..4c12ef344d5 100644 --- a/tensorflow/lite/delegates/nnapi/java/src/main/native/BUILD +++ b/tensorflow/lite/delegates/nnapi/java/src/main/native/BUILD @@ -11,7 +11,6 @@ licenses(["notice"]) # Apache 2.0 cc_library( name = "native", srcs = ["nnapi_delegate_jni.cc"], - hdrs = ["nnapi_delegate_jni.h"], copts = tflite_copts(), tags = [ "manual", diff --git a/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc b/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc index a4ff12b1254..d68ff5efac1 100644 --- a/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc +++ b/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc @@ -13,12 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.h" +#include <jni.h> #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_nnapi_NnApiDelegate_createDelegate(JNIEnv* env, jclass clazz) { return reinterpret_cast<jlong>(tflite::NnApiDelegate()); } + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.h b/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.h deleted file mode 100644 index 12cf56cab2b..00000000000 --- a/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_LITE_DELEGATES_NNAPI_JAVA_SRC_MAIN_NATIVE_NNAPI_DELEGATE_JNI_H_ -#define TENSORFLOW_LITE_DELEGATES_NNAPI_JAVA_SRC_MAIN_NATIVE_NNAPI_DELEGATE_JNI_H_ - -#include <jni.h> - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/* - * Class: org_tensorflow_lite_nnapi_NnApiDelegate - * Method: createDelegate - * Signature: ()J - */ -JNIEXPORT jlong JNICALL -Java_org_tensorflow_lite_nnapi_NnApiDelegate_createDelegate(JNIEnv* env, - jclass clazz); - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - -#endif // TENSORFLOW_LITE_DELEGATES_NNAPI_JAVA_SRC_MAIN_NATIVE_NNAPI_DELEGATE_JNI_H_ diff --git a/tensorflow/lite/examples/label_image/BUILD b/tensorflow/lite/examples/label_image/BUILD index 088cd2f78ac..88e5fd2c677 100644 --- a/tensorflow/lite/examples/label_image/BUILD +++ b/tensorflow/lite/examples/label_image/BUILD @@ -30,6 +30,7 @@ cc_binary( "//tensorflow/lite:framework", "//tensorflow/lite:string_util", "//tensorflow/lite/kernels:builtin_ops", + "//tensorflow/lite/profiling:profiler", ], ) diff --git a/tensorflow/lite/examples/label_image/label_image.cc b/tensorflow/lite/examples/label_image/label_image.cc index 340fbab5c6f..ac84e270cdb 100644 --- a/tensorflow/lite/examples/label_image/label_image.cc +++ b/tensorflow/lite/examples/label_image/label_image.cc @@ -13,6 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include <fcntl.h> // NOLINT(build/include_order) +#include <getopt.h> // NOLINT(build/include_order) +#include <sys/time.h> // NOLINT(build/include_order) +#include <sys/types.h> // NOLINT(build/include_order) +#include <sys/uio.h> // NOLINT(build/include_order) +#include <unistd.h> // NOLINT(build/include_order) + #include <cstdarg> #include <cstdio> #include <cstdlib> @@ -25,21 +32,14 @@ limitations under the License. 
#include <unordered_set> #include <vector> -#include <fcntl.h> // NOLINT(build/include_order) -#include <getopt.h> // NOLINT(build/include_order) -#include <sys/time.h> // NOLINT(build/include_order) -#include <sys/types.h> // NOLINT(build/include_order) -#include <sys/uio.h> // NOLINT(build/include_order) -#include <unistd.h> // NOLINT(build/include_order) - +#include "tensorflow/lite/examples/label_image/bitmap_helpers.h" +#include "tensorflow/lite/examples/label_image/get_top_n.h" #include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/model.h" #include "tensorflow/lite/optional_debug_tools.h" +#include "tensorflow/lite/profiling/profiler.h" #include "tensorflow/lite/string_util.h" -#include "tensorflow/lite/examples/label_image/bitmap_helpers.h" -#include "tensorflow/lite/examples/label_image/get_top_n.h" - #define LOG(x) std::cerr namespace tflite { diff --git a/tensorflow/lite/experimental/examples/lstm/BUILD b/tensorflow/lite/experimental/examples/lstm/BUILD index e47f279e54c..42e132683b2 100644 --- a/tensorflow/lite/experimental/examples/lstm/BUILD +++ b/tensorflow/lite/experimental/examples/lstm/BUILD @@ -36,6 +36,7 @@ py_test( name = "unidirectional_sequence_lstm_test", size = "large", srcs = ["unidirectional_sequence_lstm_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -58,6 +59,7 @@ py_test( name = "unidirectional_sequence_rnn_test", size = "large", srcs = ["unidirectional_sequence_rnn_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -80,6 +82,7 @@ py_test( name = "bidirectional_sequence_lstm_test", size = "large", srcs = ["bidirectional_sequence_lstm_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -102,6 +105,7 @@ py_test( name = "bidirectional_sequence_rnn_test", size = "large", srcs = ["bidirectional_sequence_rnn_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", diff --git a/tensorflow/lite/experimental/kernels/BUILD b/tensorflow/lite/experimental/kernels/BUILD index 78af889cf1e..bf4a007fb8c 100644 --- a/tensorflow/lite/experimental/kernels/BUILD +++ b/tensorflow/lite/experimental/kernels/BUILD @@ -35,7 +35,7 @@ cc_library( ) cc_library( - name = "experimental_ops", + name = "ctc_beam_search_decoder_op", srcs = [ "ctc_beam_search_decoder.cc", ], @@ -66,7 +66,7 @@ cc_test( srcs = ["ctc_beam_search_decoder_test.cc"], tags = ["tflite_not_portable_ios"], deps = [ - ":experimental_ops", + ":ctc_beam_search_decoder_op", "//tensorflow/lite:framework", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/kernels:test_util", @@ -74,3 +74,54 @@ cc_test( "@flatbuffers", ], ) + +cc_library( + name = "gru_cell", + srcs = ["gru_cell.cc"], + hdrs = ["gru_cell.h"], + deps = [ + "//tensorflow/lite/kernels:cpu_backend_context", + "//tensorflow/lite/kernels/internal:optimized_base", + "//tensorflow/lite/kernels/internal:tensor", + "//third_party/eigen3", + ], +) + +cc_library( + name = "unidirectional_sequence_gru_op", + srcs = [ + "unidirectional_sequence_gru.cc", + ], + # Suppress warnings that are introduced by Eigen Tensor. 
+ copts = tflite_copts() + [ + "-Wno-error=reorder", + ] + select({ + "//tensorflow:ios": ["-Wno-error=invalid-partial-specialization"], + "//conditions:default": [ + ], + }), + deps = [ + ":gru_cell", + "//tensorflow/lite:framework", + "//tensorflow/lite/c:c_api_internal", + "//tensorflow/lite/kernels:cpu_backend_context", + "//tensorflow/lite/kernels:cpu_backend_support", + "//tensorflow/lite/kernels:kernel_util", + "//tensorflow/lite/kernels:op_macros", + "//tensorflow/lite/kernels/internal:tensor", + "@flatbuffers", + ], +) + +cc_test( + name = "unidirectional_sequence_gru_test", + size = "small", + srcs = ["unidirectional_sequence_gru_test.cc"], + tags = ["tflite_not_portable_ios"], + deps = [ + ":unidirectional_sequence_gru_op", + "//tensorflow/lite:framework", + "//tensorflow/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) diff --git a/tensorflow/lite/experimental/kernels/gru_cell.cc b/tensorflow/lite/experimental/kernels/gru_cell.cc new file mode 100644 index 00000000000..c21896ae83f --- /dev/null +++ b/tensorflow/lite/experimental/kernels/gru_cell.cc @@ -0,0 +1,94 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/experimental/kernels/gru_cell.h" + +#include <vector> + +#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" + +namespace tflite { +namespace ops { +namespace experimental { +namespace gru_cell { + +using optimized_ops::ArrayMap; +using optimized_ops::FullyConnected; +using optimized_ops::MapAsArrayWithLastDimAsRows; +using reference_ops::Concatenation; + +void GruCell(const RuntimeShape& input_shape, const float* input, + const RuntimeShape& state_shape, const float* input_state, + const RuntimeShape& gate_weight_shape, const float* gate_weight, + const RuntimeShape& gate_bias_shape, const float* gate_bias, + const RuntimeShape& candidate_weight_shape, + const float* candidate_weight, + const RuntimeShape& candidate_bias_shape, + const float* candidate_bias, const RuntimeShape& output_shape, + float* output, float* output_state, + const RuntimeShape& activation_shape, float* activation, + const RuntimeShape& concat_shape, float* concat, + const tflite::FullyConnectedParams& fc_params, + tflite::CpuBackendContext* cpu_backend_context) { + const int n_batch = input_shape.Dims(0); + const int n_input = input_shape.Dims(1); + const int n_output = state_shape.Dims(1); + + // [x h] = concat(input, state) + std::vector<float const*> concat_arrays_data; + std::vector<RuntimeShape const*> concat_arrays_shapes; + concat_arrays_data.push_back(input); + concat_arrays_data.push_back(input_state); + concat_arrays_shapes.push_back(&input_shape); + concat_arrays_shapes.push_back(&state_shape); + tflite::ConcatenationParams concat_params; + concat_params.axis = 1; + concat_params.inputs_count = concat_arrays_data.size(); + Concatenation(concat_params, &(concat_arrays_shapes[0]), + 
&(concat_arrays_data[0]), concat_shape, concat); + + // [r u] = [x h] * gate_weight + gate_bias + FullyConnected(fc_params, concat_shape, concat, gate_weight_shape, + gate_weight, gate_bias_shape, gate_bias, activation_shape, + activation, cpu_backend_context); + + // [r u] = sigmoid([r u]) + auto ru = MapAsArrayWithLastDimAsRows(activation, activation_shape); + ru = ru.unaryExpr(Eigen::internal::scalar_logistic_op<float>()); + auto r = ru.block(0 * n_output, 0, n_output, n_batch); + auto u = ru.block(1 * n_output, 0, n_output, n_batch); + + // hr = h .* r + auto h = MapAsArrayWithLastDimAsRows(input_state, state_shape); + auto xh = MapAsArrayWithLastDimAsRows(concat, concat_shape); + auto hr = xh.block(n_input, 0, n_output, n_batch); + hr = h * r; + + // c = [x hr] * candidate_weight + candidate_bias + FullyConnected(fc_params, concat_shape, concat, candidate_weight_shape, + candidate_weight, candidate_bias_shape, candidate_bias, + output_shape, output, cpu_backend_context); + + auto c = MapAsArrayWithLastDimAsRows(output, output_shape); + // output = (1 - u) .* tanh(c) + u .* h + c = (1.0 - u) * c.tanh() + u * h; + + memcpy(output_state, output, n_batch * n_output * sizeof(float)); +} + +} // namespace gru_cell +} // namespace experimental +} // namespace ops +} // namespace tflite diff --git a/tensorflow/lite/experimental/kernels/gru_cell.h b/tensorflow/lite/experimental/kernels/gru_cell.h new file mode 100644 index 00000000000..cd7b02e2a69 --- /dev/null +++ b/tensorflow/lite/experimental/kernels/gru_cell.h @@ -0,0 +1,46 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_GRU_CELL_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_GRU_CELL_H_ + +#include "tensorflow/lite/kernels/cpu_backend_context.h" +#include "tensorflow/lite/kernels/internal/tensor.h" + +namespace tflite { +namespace ops { +namespace experimental { +namespace gru_cell { + +void GruCell(const RuntimeShape& input_shape, const float* input, + const RuntimeShape& state_shape, const float* input_state, + const RuntimeShape& gate_weight_shape, const float* gate_weight, + const RuntimeShape& gate_bias_shape, const float* gate_bias, + const RuntimeShape& candidate_weight_shape, + const float* candidate_weight, + const RuntimeShape& candidate_bias_shape, + const float* candidate_bias, const RuntimeShape& output_shape, + float* output, float* output_state, + const RuntimeShape& activation_shape, float* activation, + const RuntimeShape& concat_shape, float* concat, + const tflite::FullyConnectedParams& fc_params, + tflite::CpuBackendContext* cpu_backend_context); + +} // namespace gru_cell +} // namespace experimental +} // namespace ops +} // namespace tflite + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_GRU_CELL_H_ diff --git a/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru.cc b/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru.cc new file mode 100644 index 00000000000..fc0d681f3bc --- /dev/null +++ b/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru.cc @@ -0,0 +1,250 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include <limits> + +#include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/experimental/kernels/gru_cell.h" +#include "tensorflow/lite/kernels/cpu_backend_context.h" +#include "tensorflow/lite/kernels/cpu_backend_support.h" +#include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/kernel_util.h" + +namespace tflite { +namespace ops { +namespace experimental { +namespace unidirectional_sequence_gru { +namespace { + +void GruImpl(const TfLiteTensor* input, const TfLiteTensor* input_state, + const TfLiteTensor* gate_weight, const TfLiteTensor* gate_bias, + const TfLiteTensor* candidate_weight, + const TfLiteTensor* candidate_bias, TfLiteTensor* output, + TfLiteTensor* output_state, TfLiteTensor* activation, + TfLiteTensor* concat, + tflite::CpuBackendContext* cpu_backend_context) { + const int n_time = input->dims->data[0]; + const int n_batch = input->dims->data[1]; + const int n_input = input->dims->data[2]; + const int n_output = output->dims->data[2]; + const int n_batch_input = n_batch * n_input; + const int n_batch_output = n_batch * n_output; + const RuntimeShape input_shape({n_batch, n_input}); + const float* input_data = GetTensorData<float>(input); + const RuntimeShape state_shape = GetTensorShape(input_state); + const float* input_state_data = GetTensorData<float>(input_state); + const RuntimeShape gate_weight_shape = GetTensorShape(gate_weight); + const float* gate_weight_data = GetTensorData<float>(gate_weight); + const RuntimeShape gate_bias_shape = GetTensorShape(gate_bias); + const float* gate_bias_data = GetTensorData<float>(gate_bias); + const RuntimeShape candidate_weight_shape = GetTensorShape(candidate_weight); + const float* candidate_weight_data = GetTensorData<float>(candidate_weight); + const RuntimeShape candidate_bias_shape = GetTensorShape(candidate_bias); + const float* candidate_bias_data = GetTensorData<float>(candidate_bias); + const RuntimeShape activation_shape = GetTensorShape(activation); + const RuntimeShape output_shape = RuntimeShape({n_batch, n_output}); + float* output_data = GetTensorData<float>(output); + float* output_state_data = GetTensorData<float>(output_state); + float* activation_data = GetTensorData<float>(activation); + const RuntimeShape concat_shape = GetTensorShape(concat); + float* concat_data = GetTensorData<float>(concat); + tflite::FullyConnectedParams fc_params; + fc_params.float_activation_min = std::numeric_limits<float>::lowest(); + fc_params.float_activation_max = std::numeric_limits<float>::max(); + for (int i = 0; i < n_time; ++i) { + gru_cell::GruCell( + input_shape, input_data, state_shape, input_state_data, + gate_weight_shape, gate_weight_data, gate_bias_shape, gate_bias_data, + candidate_weight_shape, candidate_weight_data, candidate_bias_shape, + candidate_bias_data, output_shape, output_data, output_state_data, + activation_shape, activation_data, concat_shape, concat_data, fc_params, + cpu_backend_context); + input_data += n_batch_input; + output_data += n_batch_output; + input_state_data = output_state_data; + } +} + +} // namespace + +enum InputTensor { + // Input tensor of size [n_time, n_batch, n_input] + kInput = 0, + // Input state tensor of size [n_batch, n_output] + kInputState = 1, + // Gate weight tensor of size [2*n_output, n_input+n_output] + kGateWeight = 2, + // Gate bias tensor of size [2*n_output] + kGateBias = 3, + // Candidate weight tensor of size [n_output, 
n_input+n_output] + kCandidateWeight = 4, + // Candidate bias tensor of size [n_output] + kCandidateBias = 5, + kInputNum = 6 +}; + +enum OutputTensor { + // Input tensor of size [n_time, n_batch, n_output] + kOutput = 0, + // Output state tensor of size [n_batch, n_output] + kOutputState = 1, + kOutputNum = 2 +}; + +enum TemporaryTensor { + // Scratch buffer for activation of size [n_batch, 2*n_output] + kActivation = 0, + // Scratch buffer for activation of size [n_batch, n_input+n_output] + kConcat = 1, + kTemporaryNum = 2 +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + cpu_backend_support::IncrementUsageCounter(context); + auto* scratch_tensor_index = new int; + context->AddTensors(context, kTemporaryNum, scratch_tensor_index); + return scratch_tensor_index; +} + +void Free(TfLiteContext* context, void* buffer) { + cpu_backend_support::DecrementUsageCounter(context); + delete reinterpret_cast<int*>(buffer); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data); + + TF_LITE_ENSURE_EQ(context, node->inputs->size, kInputNum); + TF_LITE_ENSURE_EQ(context, node->outputs->size, kOutputNum); + + // input's dim = [n_time, n_batch, n_input] + const TfLiteTensor* input = GetInput(context, node, kInput); + TF_LITE_ENSURE_EQ(context, input->dims->size, 3); + const int n_time = input->dims->data[0]; + const int n_batch = input->dims->data[1]; + const int n_input = input->dims->data[2]; + + // input_state's dim = [n_batch, n_output] + const TfLiteTensor* input_state = GetInput(context, node, kInputState); + TF_LITE_ENSURE_EQ(context, input_state->dims->size, 2); + TF_LITE_ENSURE_EQ(context, input_state->dims->data[0], n_batch); + const int n_output = input_state->dims->data[1]; + + // gate_weight' dim = [2 * n_output, n_input + n_output] + const TfLiteTensor* gate_weight = GetInput(context, node, kGateWeight); + TF_LITE_ENSURE_EQ(context, gate_weight->dims->size, 2); + TF_LITE_ENSURE_EQ(context, gate_weight->dims->data[0], 2 * n_output); + TF_LITE_ENSURE_EQ(context, gate_weight->dims->data[1], n_input + n_output); + + // gate_bias' dim = [2 * n_output] + const TfLiteTensor* gate_bias = GetInput(context, node, kGateBias); + TF_LITE_ENSURE_EQ(context, gate_bias->dims->size, 1); + TF_LITE_ENSURE_EQ(context, gate_bias->dims->data[0], 2 * n_output); + + // candidate_weight' dim = [n_output, n_input + n_output] + const TfLiteTensor* candidate_weight = + GetInput(context, node, kCandidateWeight); + TF_LITE_ENSURE_EQ(context, candidate_weight->dims->size, 2); + TF_LITE_ENSURE_EQ(context, candidate_weight->dims->data[0], n_output); + TF_LITE_ENSURE_EQ(context, candidate_weight->dims->data[1], + n_input + n_output); + + // candidate_bias' dim = [n_output] + const TfLiteTensor* candidate_bias = GetInput(context, node, kCandidateBias); + TF_LITE_ENSURE_EQ(context, candidate_bias->dims->size, 1); + TF_LITE_ENSURE_EQ(context, candidate_bias->dims->data[0], n_output); + + // output's dim = [n_time, n_batch, n_output] + TfLiteTensor* output = GetOutput(context, node, kOutput); + TfLiteIntArray* output_size = TfLiteIntArrayCreate(3); + output_size->data[0] = n_time; + output_size->data[1] = n_batch; + output_size->data[2] = n_output; + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, output, output_size)); + + // output_state's dim = [n_batch, n_output] + TfLiteTensor* output_state = GetOutput(context, node, kOutputState); + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, 
output_state, + TfLiteIntArrayCopy(input_state->dims))); + + TfLiteIntArrayFree(node->temporaries); + node->temporaries = TfLiteIntArrayCreate(kTemporaryNum); + + // activation's dim = [n_batch, 2 * n_output] + node->temporaries->data[kActivation] = *scratch_tensor_index; + TfLiteTensor* activation = GetTemporary(context, node, kActivation); + activation->type = input->type; + activation->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* activation_size = TfLiteIntArrayCreate(2); + activation_size->data[0] = n_batch; + activation_size->data[1] = 2 * n_output; + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, activation, activation_size)); + + // concat's dim = [n_batch, n_input + n_output] + node->temporaries->data[kConcat] = (*scratch_tensor_index) + kConcat; + TfLiteTensor* concat = GetTemporary(context, node, kConcat); + concat->type = input->type; + concat->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* concat_size = TfLiteIntArrayCreate(2); + concat_size->data[0] = n_batch; + concat_size->data[1] = n_input + n_output; + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, concat, concat_size)); + + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input = GetInput(context, node, kInput); + const TfLiteTensor* input_state = GetInput(context, node, kInputState); + const TfLiteTensor* gate_weight = GetInput(context, node, kGateWeight); + const TfLiteTensor* gate_bias = GetInput(context, node, kGateBias); + const TfLiteTensor* candidate_weight = + GetInput(context, node, kCandidateWeight); + const TfLiteTensor* candidate_bias = GetInput(context, node, kCandidateBias); + TfLiteTensor* output = GetOutput(context, node, kOutput); + TfLiteTensor* output_state = GetOutput(context, node, kOutputState); + TfLiteTensor* activation = GetTemporary(context, node, kActivation); + TfLiteTensor* concat = GetTemporary(context, node, kConcat); + auto cpu_backend_context = cpu_backend_support::GetFromContext(context); + + if (gate_weight->type == kTfLiteFloat32) { + GruImpl(input, input_state, gate_weight, gate_bias, candidate_weight, + candidate_bias, output, output_state, activation, concat, + cpu_backend_context); + } else { + context->ReportError(context, + "Unsupported combination of data types for GruCell"); + return kTfLiteError; + } + + return kTfLiteOk; +} + +} // namespace unidirectional_sequence_gru + +TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_GRU() { + static TfLiteRegistration r = { + unidirectional_sequence_gru::Init, unidirectional_sequence_gru::Free, + unidirectional_sequence_gru::Prepare, unidirectional_sequence_gru::Eval}; + return &r; +} + +} // namespace experimental +} // namespace ops +} // namespace tflite diff --git a/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru_test.cc b/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru_test.cc new file mode 100644 index 00000000000..593d714e557 --- /dev/null +++ b/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru_test.cc @@ -0,0 +1,150 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include <vector> + +#include <gtest/gtest.h> +#include "tensorflow/lite/kernels/test_util.h" + +namespace tflite { +namespace ops { +namespace experimental { + +TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_GRU(); + +namespace { + +using ::testing::ElementsAre; +using ::testing::ElementsAreArray; + +class GRUOpModel : public SingleOpModel { + public: + explicit GRUOpModel(int n_batch, int n_input, int n_output, + const std::vector<std::vector<int>>& input_shapes, + const TensorType& weight_type = TensorType_FLOAT32) + : n_batch_(n_batch), n_input_(n_input), n_output_(n_output) { + input_ = AddInput(TensorType_FLOAT32); + input_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_batch, n_output}}, true); + gate_weight_ = AddInput(TensorType_FLOAT32); + gate_bias_ = AddInput(TensorType_FLOAT32); + candidate_weight_ = AddInput(TensorType_FLOAT32); + candidate_bias_ = AddInput(TensorType_FLOAT32); + + output_ = AddOutput(TensorType_FLOAT32); + output_state_ = AddOutput(TensorType_FLOAT32); + + SetCustomOp("UNIDIRECTIONAL_SEQUENCE_GRU", {}, + Register_UNIDIRECTIONAL_SEQUENCE_GRU); + BuildInterpreter(input_shapes); + } + + void SetInput(const std::vector<float>& f) { PopulateTensor(input_, f); } + + void SetInputState(const std::vector<float>& f) { + PopulateTensor(input_state_, f); + } + + void SetGateWeight(const std::vector<float>& f) { + PopulateTensor(gate_weight_, f); + } + + void SetGateBias(const std::vector<float>& f) { + PopulateTensor(gate_bias_, f); + } + + void SetCandidateWeight(const std::vector<float>& f) { + PopulateTensor(candidate_weight_, f); + } + + void SetCandidateBias(const std::vector<float>& f) { + PopulateTensor(candidate_bias_, f); + } + + std::vector<int> GetOutputShape() { return GetTensorShape(output_); } + + std::vector<float> GetOutput() { return ExtractVector<float>(output_); } + + int num_batches() { return n_batch_; } + int num_inputs() { return n_input_; } + int num_outputs() { return n_output_; } + + private: + int input_; + int input_state_; + int gate_weight_; + int gate_bias_; + int candidate_weight_; + int candidate_bias_; + + int output_; + int output_state_; + int n_batch_; + int n_input_; + int n_output_; +}; + +TEST(GRUTest, SimpleTest) { + const int n_time = 2; + const int n_batch = 2; + const int n_input = 2; + const int n_output = 3; + + GRUOpModel m(n_batch, n_input, n_output, + {{n_time, n_batch, n_input}, + {n_batch, n_output}, + {2 * n_output, n_input + n_output}, + {2 * n_output}, + {n_output, n_input + n_output}, + {n_output}}); + // All data is randomly generated. 
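+  // For orientation (an assumption inferred from the tensor shapes checked in
+  // Prepare(), not something this test states): a GRU cell conventionally
+  // computes two gates jointly from the concatenation [x_t, h_{t-1}] via
+  // gate_weight ([2*n_output, n_input+n_output]) and gate_bias, a candidate
+  // state from [x_t, r_t .* h_{t-1}] via candidate_weight
+  // ([n_output, n_input+n_output]) and candidate_bias, and the new state h_t
+  // as a gate-weighted interpolation between h_{t-1} and the candidate.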
+ m.SetInput({0.89495724, 0.34482682, 0.68505806, 0.7135783, 0.3167085, + 0.93647677, 0.47361764, 0.39643127}); + m.SetInputState( + {0.09992421, 0.3028481, 0.78305984, 0.50438094, 0.11269058, 0.10244724}); + m.SetGateWeight({0.7256918, 0.8945897, 0.03285786, 0.42637166, 0.119376324, + 0.83035135, 0.16997327, 0.42302176, 0.77598256, 0.2660894, + 0.9587266, 0.6218451, 0.88164485, 0.12272458, 0.2699055, + 0.18399088, 0.21930052, 0.3374841, 0.70866305, 0.9523419, + 0.25170696, 0.60988617, 0.79823977, 0.64477515, 0.2602957, + 0.5053131, 0.93722224, 0.8451359, 0.97905475, 0.38669217}); + m.SetGateBias( + {0.032708533, 0.018445263, 0.15320699, 0.8163046, 0.26683575, 0.1412022}); + m.SetCandidateWeight({0.96165305, 0.95572084, 0.11534478, 0.96965164, + 0.33562955, 0.8680755, 0.003066936, 0.057793964, + 0.8671354, 0.33354893, 0.7313398, 0.78492093, + 0.19530584, 0.116550304, 0.13599132}); + m.SetCandidateBias({0.89837056, 0.54769796, 0.63364106}); + + m.Invoke(); + + EXPECT_THAT(m.GetOutputShape(), ElementsAre(n_time, n_batch, n_output)); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear( + {0.20112592, 0.45286041, 0.80842507, 0.59567153, 0.2619998, + 0.22922856, 0.27715868, 0.5247152, 0.82300174, 0.65812796, + 0.38217607, 0.3401444}))); +} + +} // namespace +} // namespace experimental +} // namespace ops +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_generator.c b/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_generator.c index 766b7f2ad56..d62433a541c 100644 --- a/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_generator.c +++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_generator.c @@ -22,7 +22,8 @@ int main(int argc, char** argv) { if (argc != 3) { fprintf(stderr, "%s requires exactly two parameters - the names of the header and " - "source files to save\n"); + "source files to save\n", + argv[0]); return 1; } struct FrontendConfig frontend_config; diff --git a/tensorflow/lite/experimental/objc/BUILD.apple b/tensorflow/lite/experimental/objc/BUILD.apple index b118ae48860..e5f18059676 100644 --- a/tensorflow/lite/experimental/objc/BUILD.apple +++ b/tensorflow/lite/experimental/objc/BUILD.apple @@ -68,7 +68,7 @@ objc_library( ios_unit_test( name = "TensorFlowLiteTests", - size = "small", + size = "medium", minimum_os_version = TFL_MINIMUM_OS_VERSION, tags = TFL_DEFAULT_TAGS + TFL_DISABLED_SANITIZER_TAGS, deps = [ diff --git a/tensorflow/lite/experimental/objc/README.md b/tensorflow/lite/experimental/objc/README.md index f735d413653..e0788e61c62 100644 --- a/tensorflow/lite/experimental/objc/README.md +++ b/tensorflow/lite/experimental/objc/README.md @@ -5,7 +5,7 @@ solution for Objective-C developers. It enables low-latency inference of on-device machine learning models with a small binary size and fast performance supporting hardware acceleration. -## Getting Started +## Build TensorFlow with iOS support To build the Objective-C TensorFlow Lite library on Apple platforms, [install from source](https://www.tensorflow.org/install/source#setup_for_linux_and_macos) @@ -19,9 +19,34 @@ python configure.py Follow the prompts and when asked to build TensorFlow with iOS support, enter `y`. 
-### Bazel +### CocoaPods developers -In your `BUILD` file, add the `TensorFlowLite` dependency: +Add the TensorFlow Lite pod to your `Podfile`: + +```ruby +pod 'TensorFlowLiteObjC' +``` + +Then, run `pod install`. + +In your Objective-C files, import the umbrella header: + +```objectivec +#import "TFLTensorFlowLite.h" +``` + +Or, the module if you set `CLANG_ENABLE_MODULES = YES` in your Xcode project: + +```objectivec +@import TFLTensorFlowLite; +``` + +Note: To import the TensorFlow Lite module in your Objective-C files, you must +also include `use_frameworks!` in your `Podfile`. + +### Bazel developers + +In your `BUILD` file, add the `TensorFlowLite` dependency to your target: ```python objc_library( @@ -37,6 +62,12 @@ In your Objective-C files, import the umbrella header: #import "TFLTensorFlowLite.h" ``` +Or, the module if you set `CLANG_ENABLE_MODULES = YES` in your Xcode project: + +```objectivec +@import TFLTensorFlowLite; +``` + Build the `TensorFlowLite` Objective-C library target: ```shell @@ -49,36 +80,14 @@ Build the `TensorFlowLiteTests` target: bazel test tensorflow/lite/experimental/objc:TensorFlowLiteTests ``` -### Tulsi +#### Generate the Xcode project using Tulsi -Open the `TensorFlowLite.tulsiproj` using the -[TulsiApp](https://github.com/bazelbuild/tulsi) or by running the +Open the `//tensorflow/lite/experimental/objc/TensorFlowLite.tulsiproj` using +the [TulsiApp](https://github.com/bazelbuild/tulsi) +or by running the [`generate_xcodeproj.sh`](https://github.com/bazelbuild/tulsi/blob/master/src/tools/generate_xcodeproj.sh) script from the root `tensorflow` directory: ```shell generate_xcodeproj.sh --genconfig tensorflow/lite/experimental/objc/TensorFlowLite.tulsiproj:TensorFlowLite --outputfolder ~/path/to/generated/TensorFlowLite.xcodeproj ``` - -### CocoaPods - -Add the following to your `Podfile`: - -```ruby -pod 'TensorFlowLiteObjC' -``` - -Then, run `pod install`. - -In your Objective-C files, import the umbrella header: - -```objectivec -#import "TFLTensorFlowLite.h" -``` - -Or, the module if `CLANG_ENABLE_MODULES = YES` and `use_frameworks!` is -specified in your `Podfile`: - -```objectivec -@import TFLTensorFlowLite; -``` diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec index fe62e8ac8ee..8698df31b37 100644 --- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec +++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec @@ -1,5 +1,3 @@ -# Run `pod lib lint TensorFlowLiteObjC.podspec` to ensure this is a valid spec. - Pod::Spec.new do |s| s.name = 'TensorFlowLiteObjC' s.version = '0.2.0' diff --git a/tensorflow/lite/experimental/objc/apis/TFLTensor.h b/tensorflow/lite/experimental/objc/apis/TFLTensor.h index dc710abf4e2..fd781bd5723 100644 --- a/tensorflow/lite/experimental/objc/apis/TFLTensor.h +++ b/tensorflow/lite/experimental/objc/apis/TFLTensor.h @@ -29,6 +29,9 @@ typedef NS_ENUM(NSUInteger, TFLTensorDataType) { /** 32-bit single precision floating point. */ TFLTensorDataTypeFloat32, + /** 16-bit half precision floating point. */ + TFLTensorDataTypeFloat16, + /** 32-bit signed integer. 
*/ TFLTensorDataTypeInt32, diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm index cf5a6b4c92b..1c8b7f976ec 100644 --- a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm +++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm @@ -366,6 +366,8 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_ switch (cTensorType) { case kTfLiteFloat32: return TFLTensorDataTypeFloat32; + case kTfLiteFloat16: + return TFLTensorDataTypeFloat16; case kTfLiteInt32: return TFLTensorDataTypeInt32; case kTfLiteUInt8: diff --git a/tensorflow/lite/experimental/ruy/BUILD b/tensorflow/lite/experimental/ruy/BUILD index 68d60648ec6..be1ddea2663 100644 --- a/tensorflow/lite/experimental/ruy/BUILD +++ b/tensorflow/lite/experimental/ruy/BUILD @@ -265,11 +265,13 @@ cc_library( srcs = [ "dispatch.h", "impl.h", + "prepack.h", ], hdrs = [ "matrix.h", "path.h", "ruy.h", + "ruy_advanced.h", ], visibility = ruy_visibility(), deps = [ @@ -290,7 +292,7 @@ cc_library( ], ) -# Just a usage example. +# Usage examples. cc_binary( name = "example", srcs = ["example.cc"], @@ -299,12 +301,22 @@ cc_binary( ], ) +# Usage examples of the advanced API. +cc_binary( + name = "example_advanced", + srcs = ["example_advanced.cc"], + deps = [ + ":ruy", + ], +) + # Small library to query PMU counters, for benchmark only cc_library( name = "pmu", testonly = True, srcs = ["pmu.cc"], hdrs = ["pmu.h"], + deps = [":check_macros"], ) load(":ruy_test_ext.bzl", "ruy_test_ext_defines", "ruy_test_ext_deps") @@ -339,6 +351,7 @@ ruy_benchmark( ("i8", "i8", "i32", "u8"), ("i8", "i8", "i32", "i8"), ("u8", "u8", "i32", "i16"), + ("i8", "i8", "i32", "i32"), ], ) @@ -353,6 +366,7 @@ ruy_test( ("i8", "i8", "i32", "i8"), ("i8", "u8", "i32", "i8"), ("u8", "u8", "i32", "i16"), + ("i8", "i8", "i32", "i32"), ], ) @@ -364,7 +378,9 @@ ruy_test( ("u8", "u8", "i32", "u8"), ("i8", "i8", "i32", "i8"), ("u8", "u8", "i32", "i16"), + ("i8", "i8", "i32", "i32"), ], + tags = ["slow"], ) ruy_test( diff --git a/tensorflow/lite/experimental/ruy/allocator.h b/tensorflow/lite/experimental/ruy/allocator.h index 5edf6930866..ef1db4da269 100644 --- a/tensorflow/lite/experimental/ruy/allocator.h +++ b/tensorflow/lite/experimental/ruy/allocator.h @@ -64,8 +64,9 @@ class AlignedAllocator { // be queried cheaply, at runtime, from userspace, if needed. static constexpr std::size_t kAlignment = 64; + void operator=(const AlignedAllocator&) = delete; ~AlignedAllocator() { - RUY_DCHECK(fallback_blocks_.empty()); + FreeAll(); SystemAlignedFree(ptr_); } diff --git a/tensorflow/lite/experimental/ruy/allocator_test.cc b/tensorflow/lite/experimental/ruy/allocator_test.cc index 44848fa04ed..7006b0d1107 100644 --- a/tensorflow/lite/experimental/ruy/allocator_test.cc +++ b/tensorflow/lite/experimental/ruy/allocator_test.cc @@ -72,6 +72,30 @@ TEST(AllocatorTest, ManySmallAllocations) { } } +TEST(AllocatorTest, DestructorHandlesMainBumpPtr) { + // This is a white-box test. + Allocator allocator; + allocator.AllocateBytes(1); + allocator.FreeAll(); + // After the call to FreeAll, the allocator will consolidate all of the memory + // into the main bump-ptr allocator's block, which we then expect to be freed + // in the destructor. + // + // We have no test assertions -- we primarily expect that this trigger a leak + // checker and cause the test to fail. +} + +TEST(AllocatorTest, DestructorHandlesFallbackBlocks) { + // This is a white-box test. 
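+  // ("Fallback block" here reflects our reading of the allocator internals:
+  // a request that cannot be served from the main bump-pointer block gets its
+  // own system allocation, tracked in fallback_blocks_, which the destructor
+  // is now expected to release via FreeAll().)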
+ Allocator allocator; + // Since we just created the allocator, this will allocate a fallback block, + // which we then expect to be freed in the destructor. + // + // We have no test assertions -- we primarily expect that this trigger a leak + // checker and cause the test to fail. + allocator.AllocateBytes(1); +} + } // namespace } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/benchmark.cc b/tensorflow/lite/experimental/ruy/benchmark.cc index 55b02d24df9..ef7a109a1d0 100644 --- a/tensorflow/lite/experimental/ruy/benchmark.cc +++ b/tensorflow/lite/experimental/ruy/benchmark.cc @@ -36,7 +36,8 @@ struct BenchmarkShape { }; template <typename TestSetType> -std::vector<TestResult<DstScalar>> BenchmarkRCC(const BenchmarkShape& shape) { +std::vector<std::unique_ptr<TestResult<DstScalar>>> BenchmarkRCC( + const BenchmarkShape& shape) { TestSetType test_set; test_set.rows = shape.rows; test_set.depth = shape.depth; @@ -52,8 +53,10 @@ std::vector<TestResult<DstScalar>> BenchmarkRCC(const BenchmarkShape& shape) { test_set.rhs_zero_point = SymmetricZeroPoint<RhsScalar>() + asymmetry_rhs; test_set.use_specified_zero_points = true; test_set.perchannel = GetBoolEnvVarOrFalse("PERCHANNEL"); + test_set.benchmark_prepack_lhs = GetBoolEnvVarOrFalse("PREPACK_LHS"); + test_set.benchmark_prepack_rhs = GetBoolEnvVarOrFalse("PREPACK_RHS"); test_set.Run(); - return test_set.results; + return std::move(test_set.results); } void Benchmark() { @@ -108,7 +111,7 @@ void Benchmark() { if (benchmark_cubic) { printf("size"); for (const auto& result : results) { - printf(",%s", PathName(result).c_str()); + printf(",%s", PathName(*result).c_str()); } printf("\n"); } else { @@ -119,27 +122,28 @@ void Benchmark() { if (benchmark_cubic) { printf("%d", shape.rows); for (const auto& result : results) { - printf(",%.4g", - 2.0e-9 * shape.rows * shape.cols * shape.depth / result.latency); + printf(",%.4g", 2.0e-9 * shape.rows * shape.cols * shape.depth / + result->latency); if (getenv("RUY_BENCHMARK_PMU")) { - printf(",%.3g,%.3g,%.3g,%.3g,%.3g,%.3g", result.l1_refill_rate, - result.l2_refill_rate, result.l3_refill_rate, - result.mispred_rate, result.frontend_stall_rate, - result.backend_stall_rate); + printf(",%.3g,%.3g,%.3g,%.3g,%.3g,%.3g", result->l1_refill_rate, + result->l2_refill_rate, result->l3_refill_rate, + result->mispred_rate, result->frontend_stall_rate, + result->backend_stall_rate); } } printf("\n"); fflush(stdout); } else { for (const auto& result : results) { - printf("%s,%dx%dx%d,%.4g", PathName(result).c_str(), shape.rows, - shape.depth, shape.cols, - 2.0e-9 * shape.rows * shape.cols * shape.depth / result.latency); + printf( + "%s,%dx%dx%d,%.4g", PathName(*result).c_str(), shape.rows, + shape.depth, shape.cols, + 2.0e-9 * shape.rows * shape.cols * shape.depth / result->latency); if (getenv("RUY_BENCHMARK_PMU")) { - printf(",%.3g,%.3g,%.3g,%.3g,%.3g,%.3g", result.l1_refill_rate, - result.l2_refill_rate, result.l3_refill_rate, - result.mispred_rate, result.frontend_stall_rate, - result.backend_stall_rate); + printf(",%.3g,%.3g,%.3g,%.3g,%.3g,%.3g", result->l1_refill_rate, + result->l2_refill_rate, result->l3_refill_rate, + result->mispred_rate, result->frontend_stall_rate, + result->backend_stall_rate); } printf("\n"); } diff --git a/tensorflow/lite/experimental/ruy/dispatch.h b/tensorflow/lite/experimental/ruy/dispatch.h index 6520b2341f5..8a8e8a066b9 100644 --- a/tensorflow/lite/experimental/ruy/dispatch.h +++ b/tensorflow/lite/experimental/ruy/dispatch.h @@ -37,6 +37,8 @@ limitations under 
the License. #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_DISPATCH_H_ #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_DISPATCH_H_ +#include <limits> + #include "profiling/instrumentation.h" #include "tensorflow/lite/experimental/ruy/common.h" #include "tensorflow/lite/experimental/ruy/context.h" @@ -95,6 +97,21 @@ void EnforceZeroPointSupport(LhsScalar lhs_zero_point, RhsScalar rhs_zero_point, rhs_zero_point != std::numeric_limits<RhsScalar>::lowest()); } +template <typename Spec, typename DstScalar> +void EnforceDstSpecSupport(const Spec& spec, DstScalar dst_zero_point) { + if (!std::is_same<typename Spec::DstScalar, std::int32_t>::value) return; + + // If user is looking for the raw accumulator, zero_point and all the other + // dequantize fields don't make sense and should not be set. + RUY_DCHECK(dst_zero_point == 0); + RUY_DCHECK(spec.clamp_max == std::numeric_limits<std::int32_t>::max()); + RUY_DCHECK(spec.clamp_min == std::numeric_limits<std::int32_t>::min()); + RUY_DCHECK(spec.multiplier_fixedpoint == 0); + RUY_DCHECK(spec.multiplier_exponent == 0); + RUY_DCHECK(spec.multiplier_fixedpoint_perchannel == nullptr); + RUY_DCHECK(spec.multiplier_exponent_perchannel == nullptr); +} + inline bool IsColMajorTrMul(const DMatrix& lhs, const DMatrix& rhs, const DMatrix& dst) { return IsColMajor(lhs.layout) && IsColMajor(rhs.layout) && @@ -141,16 +158,23 @@ template <Path ThePath, typename LhsScalar, typename RhsScalar, void PopulateTrMulParams(TrMulParams* params) { static_assert((ThePath & Path::kReference) == Path::kNone, "Path::kReference should not do TrMul"); - // The optimized code paths only handle a very specific set of layouts. - // Fall back to Path::kStandardCpp if needed. + // The optimized code paths don't handle the full generality of Ruy's API. + // Fall back to Path::kStandardCpp if necessary. + bool fallback_to_standard_cpp = false; if (ThePath != Path::kStandardCpp) { + // The optimized code paths currently only handle the case of all matrices + // being column major. if (!IsColMajorTrMul(params->lhs, params->rhs, params->dst)) { - PopulateTrMulParams<Path::kStandardCpp, LhsScalar, RhsScalar, DstScalar, - Spec>(params); - return; + fallback_to_standard_cpp = true; } } + if (fallback_to_standard_cpp) { + PopulateTrMulParams<Path::kStandardCpp, LhsScalar, RhsScalar, DstScalar, + Spec>(params); + return; + } + using PackedLhsScalar = PackedType<ThePath, LhsScalar>; using PackedRhsScalar = PackedType<ThePath, RhsScalar>; using Kernel = @@ -354,6 +378,7 @@ void DispatchMul(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs, EnforceLayoutSupport<Spec>(lhs.layout, rhs.layout, dst->layout); EnforceZeroPointSupport<Spec>(lhs.zero_point, rhs.zero_point, dst->zero_point); + EnforceDstSpecSupport<Spec>(spec, dst->zero_point); // This should be a constant, for a given machine and CompiledPaths. // There is a back door to override it for testing, but in production it will diff --git a/tensorflow/lite/experimental/ruy/example_advanced.cc b/tensorflow/lite/experimental/ruy/example_advanced.cc new file mode 100644 index 00000000000..802c85c85a7 --- /dev/null +++ b/tensorflow/lite/experimental/ruy/example_advanced.cc @@ -0,0 +1,80 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include <iostream> + +#include "tensorflow/lite/experimental/ruy/ruy_advanced.h" + +// Simple allocator for allocating pre-packed matrices. +class SimpleAllocator { + public: + void* AllocateBytes(std::size_t num_bytes) { + char* p = new char[num_bytes]; + buffers_.emplace_back(p); + return static_cast<void*>(p); + } + + private: + std::vector<std::unique_ptr<char[]>> buffers_; +}; + +void ExamplePrepack(ruy::Context* context) { + const float lhs_data[] = {1, 2, 3, 4}; + const float rhs_data[] = {1, 2, 3, 4}; + float dst_data[4]; + + // Set up the matrix layouts and spec. + ruy::Matrix<float> lhs; + ruy::MakeSimpleLayout(2, 2, ruy::Order::kRowMajor, &lhs.layout); + ruy::Matrix<float> rhs; + ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, &rhs.layout); + ruy::Matrix<float> dst; + ruy::MakeSimpleLayout(2, 2, ruy::Order::kColMajor, &dst.layout); + ruy::BasicSpec<float, float> spec; + + SimpleAllocator allocator; + auto alloc_fn = [&allocator](std::size_t num_bytes) -> void* { + return allocator.AllocateBytes(num_bytes); + }; + + // In this example, we pre-pack only the RHS, but either will work. + // Note that we only need to set the data pointer for the matrix we are + // pre-packing. + ruy::PrepackedMatrix prepacked_rhs; + rhs.data = rhs_data; + ruy::PrePackForMul<ruy::kAllPaths>(lhs, rhs, spec, context, &dst, + /*prepacked_lhs=*/nullptr, &prepacked_rhs, + alloc_fn); + + // No data will be read from the RHS input matrix when using a pre-packed RHS. + rhs.data = nullptr; + lhs.data = lhs_data; + dst.data = dst_data; + ruy::MulWithPrepacked<ruy::kAllPaths>(lhs, rhs, spec, context, &dst, + /*prepacked_lhs=*/nullptr, + &prepacked_rhs); + rhs.data = rhs_data; + + // Print out the results. + std::cout << "Example Mul with pre-packing RHS, float:\n"; + std::cout << "LHS:\n" << lhs; + std::cout << "RHS:\n" << rhs; + std::cout << "Result:\n" << dst << "\n"; +} + +int main() { + ruy::Context context; + ExamplePrepack(&context); +} diff --git a/tensorflow/lite/experimental/ruy/impl.h b/tensorflow/lite/experimental/ruy/impl.h index deadf528e62..11fb156f92c 100644 --- a/tensorflow/lite/experimental/ruy/impl.h +++ b/tensorflow/lite/experimental/ruy/impl.h @@ -59,6 +59,8 @@ struct TrMulParams { DMatrix dst; PMatrix packed_lhs; PMatrix packed_rhs; + bool lhs_is_prepacked = false; + bool rhs_is_prepacked = false; // Type-erased Spec. 
void* spec = nullptr; @@ -239,14 +241,23 @@ inline void TrMul(TrMulParams* params, Context* context) { const auto loop_structure = GetLoopStructure(thread_count, rows, cols, depth); const Tuning tuning = GetTuning(context); Allocator* allocator = context->GetMainAllocator(); - AllocatePMatrix(allocator, &packed_lhs); - AllocatePMatrix(allocator, &packed_rhs); + + if (!params->lhs_is_prepacked) { + AllocatePMatrix(allocator, &packed_lhs); + } + if (!params->rhs_is_prepacked) { + AllocatePMatrix(allocator, &packed_rhs); + } if (loop_structure == LoopStructure::kSimple) { gemmlowp::ScopedProfilingLabel label_simple("TrMulImpl, simple loop"); - params->LhsRunPack(tuning, 0, rows_rounded_up); - params->RhsRunPack(tuning, 0, cols_rounded_up); + if (!params->lhs_is_prepacked) { + params->LhsRunPack(tuning, 0, rows_rounded_up); + } + if (!params->rhs_is_prepacked) { + params->RhsRunPack(tuning, 0, cols_rounded_up); + } params->RunKernel(tuning, 0, 0, rows_rounded_up, cols_rounded_up); allocator->FreeAll(); @@ -277,21 +288,29 @@ inline void TrMul(TrMulParams* params, Context* context) { } // Allocate memory. - std::atomic<bool>* lhs_packed; - allocator->Allocate(num_blocks_of_rows, &lhs_packed); - std::atomic<bool>* rhs_packed; - allocator->Allocate(num_blocks_of_cols, &rhs_packed); + std::atomic<bool>* lhs_packed = nullptr; + if (!params->lhs_is_prepacked) { + allocator->Allocate(num_blocks_of_rows, &lhs_packed); + } + std::atomic<bool>* rhs_packed = nullptr; + if (!params->rhs_is_prepacked) { + allocator->Allocate(num_blocks_of_cols, &rhs_packed); + } std::atomic<std::uint32_t>* atomic_n; allocator->Allocate(1, &atomic_n); TrMulTask* tasks; allocator->Allocate(thread_count, &tasks); // Initialize allocated data. - for (int i = 0; i < num_blocks_of_rows; i++) { - lhs_packed[i].store(false, std::memory_order_release); + if (lhs_packed != nullptr) { + for (int i = 0; i < num_blocks_of_rows; i++) { + lhs_packed[i].store(false, std::memory_order_release); + } } - for (int i = 0; i < num_blocks_of_cols; i++) { - rhs_packed[i].store(false, std::memory_order_release); + if (rhs_packed != nullptr) { + for (int i = 0; i < num_blocks_of_cols; i++) { + rhs_packed[i].store(false, std::memory_order_release); + } } atomic_n->store(thread_count); diff --git a/tensorflow/lite/experimental/ruy/internal_matrix.h b/tensorflow/lite/experimental/ruy/internal_matrix.h index 9a7d6ee6938..55deef0d318 100644 --- a/tensorflow/lite/experimental/ruy/internal_matrix.h +++ b/tensorflow/lite/experimental/ruy/internal_matrix.h @@ -21,8 +21,9 @@ limitations under the License. // TODO(silvasean): Put parts of this architecture description somewhere more // prominent. // -// The 4 different matrix types are: -// - Matrix<T>: This is a user-facing type on Ruy's external API boundary. +// The 4 main matrix types are: +// - Matrix<T>: This is a user-facing type on Ruy's external API boundary. It is +// also used internally. // - DMatrix: This is a type-erased version of Matrix<T>. "D" = "dynamic". // - PMatrix: This represents a packed matrix, which requires tracking kernel // layout and row/column sums for quantization. It is type-erased. @@ -71,10 +72,20 @@ limitations under the License. 
// // To present another structured view of our various matrix types, here's a // table: -// User matrices Packed matrices +// Plain matrices Packed matrices // +---------------------------------- // Templated | Matrix<T> PackedMatrix<T> // Type-erased | DMatrix PMatrix +// +// +// There is 1 additional matrix type not mentioned above, due to its low +// importance: +// - PrepackedMatrix: This is a user-facing version of PMatrix. It has the bare +// minimum of fields needed for representing the raw data and sums buffers of a +// packed matrix for the "advanced" explicit pre-packing API. This type plays no +// role in Ruy's internals and can generally by ignored. The only reason it +// exists is so that PMatrix is not exposed to users -- we prefer to keep the +// internal matrix types hidden, even from "advanced" users. #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_ #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_ diff --git a/tensorflow/lite/experimental/ruy/kernel.cc b/tensorflow/lite/experimental/ruy/kernel.cc index 3e47d87c84c..b3082473a17 100644 --- a/tensorflow/lite/experimental/ruy/kernel.cc +++ b/tensorflow/lite/experimental/ruy/kernel.cc @@ -24,6 +24,7 @@ namespace ruy { #define RUY_ASM_LABEL_STORE_UINT8 91 #define RUY_ASM_LABEL_STORE_INT8 92 #define RUY_ASM_LABEL_STORE_INT16 93 +#define RUY_ASM_LABEL_STORE_INT32 94 #define RUY_ASM_LABEL_AFTER_STORE 99 #define RUY_OFFSET_BIAS 0 @@ -49,8 +50,8 @@ namespace ruy { #define RUY_OFFSET_DST_STRIDE 112 #define RUY_OFFSET_DEPTH 116 #define RUY_OFFSET_CLAMP_MIN 120 -#define RUY_OFFSET_CLAMP_MAX 122 -#define RUY_OFFSET_FLAGS 124 +#define RUY_OFFSET_CLAMP_MAX 124 +#define RUY_OFFSET_FLAGS 128 template <typename Params> void CheckOffsetsInKernelParams8bit(const Params&) { @@ -476,6 +477,12 @@ void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 4>& params) { "sub v17.4s, v17.4s, v11.4s\n" "sub v18.4s, v18.4s, v11.4s\n" "sub v19.4s, v19.4s, v11.4s\n" + + // If the destination is int32, it means the user asks for the raw + // accumulators, no need for us to downquantize the value. + "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n" + "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n" + "402:\n" // At this point we have computed the final int32 values. Now we @@ -924,6 +931,108 @@ void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 4>& params) { RUY_MAKE_ZERO(v16) RUY_MAKE_ZERO(v17) + "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n" + + RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n" + + // Since the store type is the same as the accum type, no need for + // downcast. There's also no need for clamp by min/max. + + // At this point, v20 -- v31 aren't used anymore for the current block, + // so we can start clearing these accumulators for the next block + // (next iteration of the main loop). + RUY_MAKE_ZERO(v20) + RUY_MAKE_ZERO(v21) + RUY_MAKE_ZERO(v22) + RUY_MAKE_ZERO(v23) + RUY_MAKE_ZERO(v24) + RUY_MAKE_ZERO(v25) + RUY_MAKE_ZERO(v26) + RUY_MAKE_ZERO(v27) + RUY_MAKE_ZERO(v28) + RUY_MAKE_ZERO(v29) + RUY_MAKE_ZERO(v30) + RUY_MAKE_ZERO(v31) + + // Compute how much of the 4x4 block of destination 8bit values that + // we have computed, fit in the destination matrix. Typically, all of + // it fits, but when the destination matrix shape is not a multiple + // of 4x4, there are some 4x4 blocks along the boundaries that do + // not fit entirely. 
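+        // The code below mirrors the existing 8bit/16bit store paths: if the
+        // whole 4x4 block fits (w1 == 4 and w2 == 4), the int32 accumulators
+        // are stored directly to the destination; otherwise they are spilled
+        // to dst_tmp_buf and a scalar loop copies only the fitting w1 x w2
+        // sub-block into the destination matrix.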
+ "sub w1, %w[dst_rows], %w[row]\n" + "sub w2, %w[dst_cols], %w[col]\n" + "mov w3, #4\n" + "cmp w1, #4\n" + // Compute w1 = how many rows of the 4x4 block fit + "csel w1, w1, w3, le\n" + "cmp w2, #4\n" + // Compute w2 = how many cols of the 4x4 block fit + "csel w2, w2, w3, le\n" + + // Test if w1==4 && w2 == 4, i.e. if all of the 8x8 block fits. + "cmp w1, w3\n" + "ccmp w2, w3, 0, eq\n" + "mov x4, %[dst_ptr]\n" + // Yes, all of the 4x4 block fits, go to fast path. + "beq 30f\n" + // Not all of the 4x4 block fits. + // Store to dst_tmp_buf + "str q16, [%[dst_tmp_buf], #0]\n" + "str q17, [%[dst_tmp_buf], #16]\n" + "str q18, [%[dst_tmp_buf], #32]\n" + "str q19, [%[dst_tmp_buf], #48]\n" + // Slow loop copying from dst_tmp_buf to dst. + "mov x3, %[dst_tmp_buf]\n" + "mov w6, #0\n" + "50:\n" + "mov w5, #0\n" + "51:\n" + "ldr w7, [x3, x5, lsl #2]\n" + "str w7, [x4, x5, lsl #2]\n" + "add w5, w5, #1\n" + "cmp w5, w1\n" + "blt 51b\n" + "add w6, w6, #1\n" + "add x3, x3, #16\n" + "add x4, x4, x11\n" + "cmp w6, w2\n" + "blt 50b\n" + "b 31f\n" + "30:\n" + // Yes, all of the 4x4 block fits. + "mov x3, x4\n" + "st1 {v16.s}[0], [x3], #4\n" + "add x4, x4, x11\n" + "st1 {v16.s}[1], [x3], #4\n" + "st1 {v16.s}[2], [x3], #4\n" + "st1 {v16.s}[3], [x3], #4\n" + "mov x3, x4\n" + "st1 {v17.s}[0], [x3], #4\n" + "add x4, x4, x11\n" + "st1 {v17.s}[1], [x3], #4\n" + "st1 {v17.s}[2], [x3], #4\n" + "st1 {v17.s}[3], [x3], #4\n" + "mov x3, x4\n" + "st1 {v18.s}[0], [x3], #4\n" + "add x4, x4, x11\n" + "st1 {v18.s}[1], [x3], #4\n" + "st1 {v18.s}[2], [x3], #4\n" + "st1 {v18.s}[3], [x3], #4\n" + "mov x3, x4\n" + "st1 {v19.s}[0], [x3], #4\n" + "add x4, x4, x11\n" + "st1 {v19.s}[1], [x3], #4\n" + "st1 {v19.s}[2], [x3], #4\n" + "st1 {v19.s}[3], [x3], #4\n" + "31:\n" + + "add %[dst_ptr], %[dst_ptr], #16\n" + + RUY_MAKE_ZERO(v16) + RUY_MAKE_ZERO(v17) + RUY_MAKE_ZERO(v18) + RUY_MAKE_ZERO(v19) + RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n" // For the next block: perform the first few multiply-adds on the data @@ -1398,6 +1507,12 @@ void Kernel8bitNeonInOrder(const KernelParams8bit<4, 4>& params) { "sub v17.4s, v17.4s, v11.4s\n" "sub v18.4s, v18.4s, v11.4s\n" "sub v19.4s, v19.4s, v11.4s\n" + + // If the destination is int32, it means the user asks for the raw + // accumulators, no need for us to downquantize the value. + "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n" + "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n" + "402:\n" // At this point we have computed the final int32 values. Now we @@ -1876,6 +1991,130 @@ void Kernel8bitNeonInOrder(const KernelParams8bit<4, 4>& params) { RUY_MAKE_ZERO(v16) RUY_MAKE_ZERO(v17) + "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n" + + RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n" + + "ldr x1, [%[lhs_ptr], #8]\n" + "ldr x2, [%[lhs_ptr], #24]\n" + "ldr x3, [%[lhs_ptr], #40]\n" + "ldr x4, [%[lhs_ptr], #56]\n" + + "ins v0.d[1], x1\n" + "ldr x1, [%[rhs_ptr], #8]\n" + "ins v1.d[1], x2\n" + "ldr x2, [%[rhs_ptr], #24]\n" + "ins v2.d[1], x3\n" + "ldr x3, [%[rhs_ptr], #40]\n" + "ins v3.d[1], x4\n" + "ldr x4, [%[rhs_ptr], #56]\n" + "ins v4.d[1], x1\n" + "ins v5.d[1], x2\n" + "ins v6.d[1], x3\n" + "ins v7.d[1], x4\n" + + // Since the store type is the same as the accum type, no need for + // downcast. There's also no need for clamp by min/max. + + // At this point, v20 -- v31 aren't used anymore for the current block, + // so we can start clearing these accumulators for the next block + // (next iteration of the main loop). 
+ + RUY_MAKE_ZERO(v20) + "add %[lhs_ptr], %[lhs_ptr], #64\n" + RUY_MAKE_ZERO(v21) + "add %[rhs_ptr], %[rhs_ptr], #64\n" + RUY_MAKE_ZERO(v22) + + RUY_MAKE_ZERO(v23) + RUY_MAKE_ZERO(v24) + RUY_MAKE_ZERO(v25) + RUY_MAKE_ZERO(v26) + RUY_MAKE_ZERO(v27) + RUY_MAKE_ZERO(v28) + RUY_MAKE_ZERO(v29) + RUY_MAKE_ZERO(v30) + + // Compute how much of the 4x4 block of destination 8bit values that + // we have computed, fit in the destination matrix. Typically, all of + // it fits, but when the destination matrix shape is not a multiple + // of 4x4, there are some 4x4 blocks along the boundaries that do + // not fit entirely. + "sub w1, %w[dst_rows], %w[row]\n" + RUY_MAKE_ZERO(v31) + "sub w2, %w[dst_cols], %w[col]\n" + "mov w3, #4\n" + "cmp w1, #4\n" + // Compute w1 = how many rows of the 4x4 block fit + "csel w1, w1, w3, le\n" + "cmp w2, #4\n" + // Compute w2 = how many cols of the 4x4 block fit + "csel w2, w2, w3, le\n" + + // Test if w1==4 && w2 == 4, i.e. if all of the 8x8 block fits. + "cmp w1, w3\n" + "ccmp w2, w3, 0, eq\n" + "mov x4, %[dst_ptr]\n" + // Yes, all of the 4x4 block fits, go to fast path. + "beq 30f\n" + // Not all of the 4x4 block fits. + // Store to dst_tmp_buf + "str q16, [%[dst_tmp_buf], #0]\n" + "str q17, [%[dst_tmp_buf], #16]\n" + "str q18, [%[dst_tmp_buf], #32]\n" + "str q19, [%[dst_tmp_buf], #48]\n" + // Slow loop copying from dst_tmp_buf to dst. + "mov x3, %[dst_tmp_buf]\n" + "mov w6, #0\n" + "50:\n" + "mov w5, #0\n" + "51:\n" + "ldr w7, [x3, x5, lsl #2]\n" + "str w7, [x4, x5, lsl #2]\n" + "add w5, w5, #1\n" + "cmp w5, w1\n" + "blt 51b\n" + "add w6, w6, #1\n" + "add x3, x3, #16\n" + "add x4, x4, x11\n" + "cmp w6, w2\n" + "blt 50b\n" + "b 31f\n" + "30:\n" + // Yes, all of the 4x4 block fits. + "mov x3, x4\n" + "st1 {v16.s}[0], [x3], #4\n" + "add x4, x4, x11\n" + "st1 {v16.s}[1], [x3], #4\n" + "st1 {v16.s}[2], [x3], #4\n" + "st1 {v16.s}[3], [x3], #4\n" + "mov x3, x4\n" + "st1 {v17.s}[0], [x3], #4\n" + "add x4, x4, x11\n" + "st1 {v17.s}[1], [x3], #4\n" + "st1 {v17.s}[2], [x3], #4\n" + "st1 {v17.s}[3], [x3], #4\n" + "mov x3, x4\n" + "st1 {v18.s}[0], [x3], #4\n" + "add x4, x4, x11\n" + "st1 {v18.s}[1], [x3], #4\n" + "st1 {v18.s}[2], [x3], #4\n" + "st1 {v18.s}[3], [x3], #4\n" + "mov x3, x4\n" + "st1 {v19.s}[0], [x3], #4\n" + "add x4, x4, x11\n" + "st1 {v19.s}[1], [x3], #4\n" + "st1 {v19.s}[2], [x3], #4\n" + "st1 {v19.s}[3], [x3], #4\n" + "31:\n" + + "add %[dst_ptr], %[dst_ptr], #16\n" + + RUY_MAKE_ZERO(v16) + RUY_MAKE_ZERO(v17) + RUY_MAKE_ZERO(v18) + RUY_MAKE_ZERO(v19) + RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n" // For the next block: perform the first few multiply-adds on the data @@ -2440,6 +2679,10 @@ void Kernel8bitNeonDotprodOutOfOrder(const KernelParams8bit<8, 8>& params) { "sub v29.4s, v29.4s, v12.4s\n" "sub v30.4s, v30.4s, v11.4s\n" "sub v31.4s, v31.4s, v12.4s\n" + + "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n" + "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n" + "402:\n" // At this point we have computed the final int32 values. Now we @@ -3041,7 +3284,7 @@ void Kernel8bitNeonDotprodOutOfOrder(const KernelParams8bit<8, 8>& params) { "mov x4, x11\n" "231:\n" - // Write our 8bit values to the destination described by + // Write our 16bit values to the destination described by // (x3 address, x4 stride). "st1 {v16.8h}, [x3], x4\n" RUY_MAKE_ZERO(v16) @@ -3094,6 +3337,159 @@ void Kernel8bitNeonDotprodOutOfOrder(const KernelParams8bit<8, 8>& params) { // At this point we have completely finished writing values to the // destination matrix for the current block. 
+ "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n" + + RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n" + + // Since the store type is the same as the accum type, no need for + // downcast. There's also no need for clamp by min/max. + + // Compute how much of the 8x8 block of destination 32it values that + // we have computed, fit in the destination matrix. Typically, all of + // it fits, but when the destination matrix shape is not a multiple + // of 8x8, there are some 8x8 blocks along the boundaries that do + // not fit entirely. + "sub w1, %w[dst_rows], %w[row]\n" + "sub w2, %w[dst_cols], %w[col]\n" + "mov w3, #8\n" + "cmp w1, #8\n" + // Compute w1 = how many rows of the 8x8 block fit + "csel w1, w1, w3, le\n" + "cmp w2, #8\n" + // Compute w1 = how many rows of the 8x8 block fit + "csel w2, w2, w3, le\n" + + // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits. + "cmp w1, w3\n" + "ccmp w2, w3, 0, eq\n" + // Yes, all of the 8x8 block fits, go to fast path. + "beq 330f\n" + // Not all of the 8x8 block fits. + // Set (x3 address, x4 stride) to write to dst_tmp_buf + "mov x3, %[dst_tmp_buf]\n" + "mov x4, #16\n" + + // Write our 32bit values to the destination described by + // (x3 address, x4 stride). + "st1 {v16.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v16) + "st1 {v17.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v17) + "st1 {v18.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v18) + "st1 {v19.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v19) + "st1 {v20.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v20) + "st1 {v21.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v21) + "st1 {v22.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v22) + "st1 {v23.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v23) + "st1 {v24.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v24) + "st1 {v25.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v25) + "st1 {v26.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v26) + "st1 {v27.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v27) + "st1 {v28.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v28) + "st1 {v29.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v29) + "st1 {v30.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v30) + "st1 {v31.4s}, [x3], x4\n" + RUY_MAKE_ZERO(v31) + + "b 331f\n" + + "330:\n" + // Yes, all of the 8x8 block fits. + // Set (x3 address, x4 stride) to write directly to destination matrix. + "mov x4, %[dst_ptr]\n" + "mov x3, x4\n" + + // Write our 32bit values to the destination described by + // (x3 address, x4 stride). + "st1 {v16.4s, v17.4s}, [x3], #32\n" + RUY_MAKE_ZERO(v16) + RUY_MAKE_ZERO(v17) + "add x4, x4, x11\n" + "mov x3, x4\n" + "st1 {v18.4s, v19.4s}, [x3], #32\n" + RUY_MAKE_ZERO(v18) + RUY_MAKE_ZERO(v19) + "add x4, x4, x11\n" + "mov x3, x4\n" + "st1 {v20.4s, v21.4s}, [x3], #32\n" + RUY_MAKE_ZERO(v20) + RUY_MAKE_ZERO(v21) + "add x4, x4, x11\n" + "mov x3, x4\n" + "st1 {v22.4s, v23.4s}, [x3], #32\n" + RUY_MAKE_ZERO(v22) + RUY_MAKE_ZERO(v23) + "add x4, x4, x11\n" + "mov x3, x4\n" + "st1 {v24.4s, v25.4s}, [x3], #32\n" + RUY_MAKE_ZERO(v24) + RUY_MAKE_ZERO(v25) + "add x4, x4, x11\n" + "mov x3, x4\n" + "st1 {v26.4s, v27.4s}, [x3], #32\n" + RUY_MAKE_ZERO(v26) + RUY_MAKE_ZERO(v27) + "add x4, x4, x11\n" + "mov x3, x4\n" + "st1 {v28.4s, v29.4s}, [x3], #32\n" + RUY_MAKE_ZERO(v28) + RUY_MAKE_ZERO(v29) + "add x4, x4, x11\n" + "mov x3, x4\n" + "st1 {v30.4s, v31.4s}, [x3], #32\n" + RUY_MAKE_ZERO(v30) + RUY_MAKE_ZERO(v31) + + "331:\n" + + // For the next block: perform the first few multiply-adds on the data + // that we have already loaded. 
+ ".word 0x4f82e010 // sdot v16.4s, v0.16b, v2.4b[0]\n" + ".word 0x4fa2e012 // sdot v18.4s, v0.16b, v2.4b[1]\n" + ".word 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n" + ".word 0x4fa2e816 // sdot v22.4s, v0.16b, v2.4b[3]\n" + + // If all of the 8x8 block fits, we just finished writing it to the + // destination, so we skip the next part. + "beq 341f\n" + + // Not all of the 8x8 block fits in the destination matrix. We just + // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over + // it to copy into the destination matrix the part that fits. + "mov x3, %[dst_tmp_buf]\n" + "mov x4, %[dst_ptr]\n" + "mov w6, #0\n" + "350:\n" + "mov w5, #0\n" + "351:\n" + "ldr w7, [x3, x5, lsl #2]\n" + "str w7, [x4, x5, lsl #2]\n" + "add w5, w5, #1\n" + "cmp w5, w1\n" + "blt 351b\n" + "add w6, w6, #1\n" + "add x3, x3, #32\n" + "add x4, x4, x11\n" + "cmp w6, w2\n" + "blt 350b\n" + "341:\n" + "add %[dst_ptr], %[dst_ptr], #32\n" + // At this point we have completely finished writing values to the + // destination matrix for the current block. + RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n" // Reload some params --- we had used x5 -- x7 for a few other things diff --git a/tensorflow/lite/experimental/ruy/kernel.h b/tensorflow/lite/experimental/ruy/kernel.h index 0318067d4a7..b5c6c043692 100644 --- a/tensorflow/lite/experimental/ruy/kernel.h +++ b/tensorflow/lite/experimental/ruy/kernel.h @@ -197,6 +197,7 @@ RUY_INHERIT_KERNEL(Path::kNeon, Path::kNeonDotprod) #define RUY_ASM_TYPE_ID_UINT8 1 #define RUY_ASM_TYPE_ID_INT8 2 #define RUY_ASM_TYPE_ID_INT16 3 +#define RUY_ASM_TYPE_ID_INT32 4 template <typename DstScalar> struct DstTypeId {}; @@ -216,9 +217,14 @@ struct DstTypeId<std::int16_t> { static constexpr int kValue = RUY_ASM_TYPE_ID_INT16; }; +template <> +struct DstTypeId<std::int32_t> { + static constexpr int kValue = RUY_ASM_TYPE_ID_INT32; +}; + template <int LhsCols, int RhsCols> struct KernelParams8bit { - static constexpr int kMaxDstTypeSize = 2; + static constexpr int kMaxDstTypeSize = 4; const std::int32_t* bias; const std::int32_t* lhs_sums; @@ -242,8 +248,8 @@ struct KernelParams8bit { std::int32_t rhs_stride; std::int32_t dst_stride; std::int32_t depth; - std::int16_t clamp_min; - std::int16_t clamp_max; + std::int32_t clamp_min; + std::int32_t clamp_max; std::uint8_t flags; std::uint8_t dst_type_id; const std::int32_t zero_data[LhsCols] = {0}; @@ -365,7 +371,9 @@ struct Kernel<Path::kNeonDotprod, std::int8_t, std::int8_t, DstScalar, KernelParams8bit<LhsLayout::kCols, RhsLayout::kCols> params; MakeKernelParams8bit(lhs, rhs, spec, start_row, start_col, end_row, end_col, dst, ¶ms); - if (__builtin_expect(tuning == Tuning::kInOrder, true)) { + // TODO(renjieliu): Add support for in order case. + if (__builtin_expect(tuning == Tuning::kInOrder, true) && + !std::is_same<DstScalar, std::int32_t>::value) { Kernel8bitNeonDotprodInOrder(params); } else { Kernel8bitNeonDotprodOutOfOrder(params); diff --git a/tensorflow/lite/experimental/ruy/matrix.h b/tensorflow/lite/experimental/ruy/matrix.h index 49b7c1df14a..7a3351ba1ac 100644 --- a/tensorflow/lite/experimental/ruy/matrix.h +++ b/tensorflow/lite/experimental/ruy/matrix.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_MATRIX_H_ #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_MATRIX_H_ +#include <cstddef> #include <cstdint> #include <type_traits> @@ -52,8 +53,18 @@ class ConstCheckingPtr final { using element_type = T; // Convenience methods. Most `set` calls go through these. 
- void operator=(T* ptr) { set(ptr); } - void operator=(const T* ptr) { set(ptr); } + ConstCheckingPtr& operator=(T* ptr) { + set(ptr); + return *this; + } + ConstCheckingPtr& operator=(const T* ptr) { + set(ptr); + return *this; + } + ConstCheckingPtr& operator=(std::nullptr_t) { + set(static_cast<T*>(nullptr)); + return *this; + } // Core accessors. These encapsulate the main logic: // - for `set`, the constness of the argument determines whether internal @@ -117,6 +128,15 @@ inline void MakeSimpleLayout(int rows, int cols, Order order, Layout* layout) { layout->stride = order == Order::kColMajor ? rows : cols; } +// Opaque data structure representing a pre-packed matrix, as obtained from +// Ruy's advanced API. +struct PrepackedMatrix { + void* data = nullptr; + std::size_t data_size = 0; + void* sums = nullptr; + std::size_t sums_size = 0; +}; + template <typename StreamType, typename Scalar> StreamType& operator<<(StreamType& stream, const Matrix<Scalar>& mat) { for (int row = 0; row < mat.layout.rows; row++) { diff --git a/tensorflow/lite/experimental/ruy/pack.cc b/tensorflow/lite/experimental/ruy/pack.cc index 801210dc35d..d21c9841a5f 100644 --- a/tensorflow/lite/experimental/ruy/pack.cc +++ b/tensorflow/lite/experimental/ruy/pack.cc @@ -1145,11 +1145,19 @@ void PackFloatNeonOutOfOrder(const float* src_ptr0, const float* src_ptr1, "trn1 v21.2d, v17.2d, v19.2d\n" "trn2 v23.2d, v17.2d, v19.2d\n" - "str q20, [%[packed_ptr], #0]\n" - "str q21, [%[packed_ptr], #32]\n" - "str q22, [%[packed_ptr], #64]\n" - "str q23, [%[packed_ptr], #96]\n" - "add %[packed_ptr], %[packed_ptr], #128\n" + "mov x1, #32\n" + +#define RUY_STORE_ONE_ROW(ROW, REGISTER) \ + "cmp w2, #" #ROW "\n" \ + "beq 4f\n" \ + "st1 {" #REGISTER ".4s}, [%[packed_ptr]], x1\n" + + RUY_STORE_ONE_ROW(0, v20) + RUY_STORE_ONE_ROW(1, v21) + RUY_STORE_ONE_ROW(2, v22) + RUY_STORE_ONE_ROW(3, v23) + +#undef RUY_STORE_ONE_ROW "4:\n" @@ -1295,11 +1303,19 @@ void PackFloatNeonInOrder(const float* src_ptr0, const float* src_ptr1, "trn1 v21.2d, v17.2d, v19.2d\n" "trn2 v23.2d, v17.2d, v19.2d\n" - "str q20, [%[packed_ptr], #0]\n" - "str q21, [%[packed_ptr], #32]\n" - "str q22, [%[packed_ptr], #64]\n" - "str q23, [%[packed_ptr], #96]\n" - "add %[packed_ptr], %[packed_ptr], #128\n" + "mov x1, #32\n" + +#define RUY_STORE_ONE_ROW(ROW, REGISTER) \ + "cmp w2, #" #ROW "\n" \ + "beq 4f\n" \ + "st1 {" #REGISTER ".4s}, [%[packed_ptr]], x1\n" + + RUY_STORE_ONE_ROW(0, v20) + RUY_STORE_ONE_ROW(1, v21) + RUY_STORE_ONE_ROW(2, v22) + RUY_STORE_ONE_ROW(3, v23) + +#undef RUY_STORE_ONE_ROW "4:\n" diff --git a/tensorflow/lite/experimental/ruy/pack.h b/tensorflow/lite/experimental/ruy/pack.h index 753a202e69e..310fcb086a6 100644 --- a/tensorflow/lite/experimental/ruy/pack.h +++ b/tensorflow/lite/experimental/ruy/pack.h @@ -13,6 +13,73 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// # What is "packing"? +// +// Before feeding data to the gemm kernels (the parts of Ruy that do lots +// of multiply-add operations), Ruy first performs a data transformation (which +// we call "packing") on the input matrices. This transformation has two main +// goals: +// - rearrange data into blocks that are a convenient size/layout for the gemm +// kernels to consume. 
This helps make the memory access pattern of the gemm +// kernel simpler and more contiguous, and puts the data in a layout most +// convenient for specific arithmetic instructions in the gemm kernel. +// - compute row/column sums needed for handling quantization with non-symmetric +// zero points. +// +// # Simplified algorithmic analysis of packing +// +// Packing is a relatively simple transformation which does a small constant +// amount of work on each element of an input matrix, and hence for an NxM +// matrix performs O(N*M) work. If N and M are of the same order, then this is +// O(N^2) work. +// +// A NxKxM matrix multiplication requires N*K*M multiply-accumulate operations. +// Note that if N, K, and M are all the same order, then the number of +// multiply-accumulate operations is O(N^3). +// +// Thus, the O(N^2) cost of packing is small compared to the O(N^3) work, in the +// case of all dimensions being roughly the same order. +// +// # Packing cost can be significant +// +// When matrix * matrix multiplications begin to look more like matrix * vector +// multiplications, packing cost can become significant. We sometimes call these +// cases "gemv-like". +// +// Continuing the algorithmic analysis above, if we consider a case where an +// NxKxM matrix multiplication has either N = O(1) or M = O(1), then the +// situation is different. In this case, the multiply-accumulate work is only +// quadratic, so the quadratic cost of packing can be come significant. +// +// Another way to say this is that the cost of packing an input matrix (either +// the LHS or RHS) is amortized across the non-depth dimension of the opposite +// input matrix. Thus, when the LHS has very few rows or the RHS has very few +// columns, the cost of packing the opposite input matrix can become +// significant. +// +// As a rough rule of thumb, the cost of packing starts to become significant +// when either N or M is below 32 (and other dimensions are hundreds), with very +// significant packing costs at 8 or below. This varies by data type, Path, and +// tuning, so these numbers are only rough guides. +// +// One practical use case that is affected by this is inference of +// fully connected neural network layers with a low batch size. The weight +// matrix (which is a constant for inference) is the one affected by significant +// packing cost. +// +// Ruy provides an API in ruy_advanced.h for advanced users to pre-pack +// input matrices that are affected by significant packing costs. +// +// # Implementation notes +// +// Ruy's packing routines always operate on a range of columns and can be +// applied to either the LHS or RHS. This is possible because Ruy internally +// implements a TrMul, so the accumulation along depth is done along columns of +// both the LHS and RHS (whereas for a normal Mul the accumulation along depth +// for the LHS is along rows). As another example, we are always computing +// column sums for quantization (and never row sums, since the LHS is +// transposed). 
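+//
+// As a rough worked illustration of the numbers above (illustrative only, not
+// measured): for a gemv-like shape with N = 8 and K = M = 1024, the kernels
+// perform N*K*M ~= 8.4e6 multiply-accumulates while packing the RHS touches on
+// the order of K*M ~= 1.0e6 elements, i.e. more than 10% of the arithmetic
+// work. For a square shape with N = K = M = 1024, packing both inputs touches
+// about 2*N*K ~= 2.1e6 elements against N*K*M ~= 1.1e9 multiply-accumulates,
+// i.e. roughly 0.2%, which is negligible. This asymmetry is what the
+// pre-packing API in ruy_advanced.h addresses.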
+ #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_H_ #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_H_ @@ -262,7 +329,7 @@ void PackFloatNeonInOrder(const float* src_ptr0, const float* src_ptr1, float* packed_ptr, int start_col, int end_col); template <> -struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kColMajor, 1, 8>, float, +struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 8>, float, float, float> { static void Run(Tuning tuning, const Matrix<float>& src_matrix, PackedMatrix<float>* packed_matrix, int start_col, diff --git a/tensorflow/lite/experimental/ruy/pmu.cc b/tensorflow/lite/experimental/ruy/pmu.cc index b69fd80381f..2fe79e7ec78 100644 --- a/tensorflow/lite/experimental/ruy/pmu.cc +++ b/tensorflow/lite/experimental/ruy/pmu.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/lite/experimental/ruy/pmu.h" +#include "tensorflow/lite/experimental/ruy/check_macros.h" + #ifdef __linux__ #include <asm/unistd.h> #include <linux/perf_event.h> @@ -54,7 +56,7 @@ class PerfEvent { void Stop() { ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0); - read(fd_, &count_, sizeof(count_)); + RUY_CHECK_NE(read(fd_, &count_, sizeof(count_)), -1); close(fd_); } diff --git a/tensorflow/lite/experimental/ruy/prepack.h b/tensorflow/lite/experimental/ruy/prepack.h new file mode 100644 index 00000000000..60fd9465c90 --- /dev/null +++ b/tensorflow/lite/experimental/ruy/prepack.h @@ -0,0 +1,107 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Implementation of low-level pre-packing API. 
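+//
+// The public entry points PrePackForMul() and MulWithPrepacked() in
+// ruy_advanced.h forward to the *Internal functions below. Intended usage (a
+// sketch; see example_advanced.cc for a complete program): pre-pack once,
+// supplying an alloc_fn that returns buffers the caller keeps alive, then
+// multiply any number of times with the same PrepackedMatrix objects; a
+// pre-packed input may have its Matrix::data pointer left null on those later
+// calls, since its data is read from the pre-packed buffers instead.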
+ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_PREPACK_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PREPACK_H_ + +#include <functional> + +#include "tensorflow/lite/experimental/ruy/context.h" +#include "tensorflow/lite/experimental/ruy/dispatch.h" +#include "tensorflow/lite/experimental/ruy/matrix.h" +#include "tensorflow/lite/experimental/ruy/path.h" +#include "tensorflow/lite/experimental/ruy/spec.h" + +namespace ruy { + +template <Path CompiledPaths, typename LhsScalar, typename RhsScalar, + typename DstScalar, typename Spec> +void PrePackForMulInternal(const Matrix<LhsScalar>& lhs, + const Matrix<RhsScalar>& rhs, const Spec& spec, + Context* context, Matrix<DstScalar>* dst, + PrepackedMatrix* prepacked_lhs, + PrepackedMatrix* prepacked_rhs, + std::function<void*(std::size_t)> alloc_fn) { + gemmlowp::ScopedProfilingLabel label("PrePackForMul"); + Path the_path = context->GetPathToTake<CompiledPaths>(); + RUY_CHECK(the_path != Path::kReference); + constexpr Path TrMulCompiledPaths = CompiledPaths & ~Path::kReference; + Matrix<LhsScalar> transposed_lhs(lhs); + Transpose(&transposed_lhs); + TrMulParams params; + CreateTrMulParams<TrMulCompiledPaths>(transposed_lhs, rhs, spec, context, dst, + the_path, ¶ms); + + Tuning tuning = GetTuning(context); + if (prepacked_lhs) { + prepacked_lhs->data_size = DataSize(params.packed_lhs); + prepacked_lhs->sums_size = SumsSize(params.packed_lhs); + prepacked_lhs->data = alloc_fn(prepacked_lhs->data_size); + prepacked_lhs->sums = alloc_fn(prepacked_lhs->sums_size); + params.packed_lhs.data = prepacked_lhs->data; + params.packed_lhs.sums = prepacked_lhs->sums; + params.LhsRunPack(tuning, 0, params.packed_lhs.layout.cols); + } + if (prepacked_rhs) { + prepacked_rhs->data_size = DataSize(params.packed_rhs); + prepacked_rhs->sums_size = SumsSize(params.packed_rhs); + prepacked_rhs->data = alloc_fn(prepacked_rhs->data_size); + prepacked_rhs->sums = alloc_fn(prepacked_rhs->sums_size); + params.packed_rhs.data = prepacked_rhs->data; + params.packed_rhs.sums = prepacked_rhs->sums; + params.RhsRunPack(tuning, 0, params.packed_rhs.layout.cols); + } +} + +template <Path CompiledPaths, typename LhsScalar, typename RhsScalar, + typename DstScalar, typename Spec> +void MulWithPrepackedInternal(const Matrix<LhsScalar>& lhs, + const Matrix<RhsScalar>& rhs, const Spec& spec, + Context* context, Matrix<DstScalar>* dst, + PrepackedMatrix* prepacked_lhs, + PrepackedMatrix* prepacked_rhs) { + gemmlowp::ScopedProfilingLabel label("MulWithPrepacked"); + + EnforceLayoutSupport<Spec>(lhs.layout, rhs.layout, dst->layout); + EnforceZeroPointSupport<Spec>(lhs.zero_point, rhs.zero_point, + dst->zero_point); + + Path the_path = context->GetPathToTake<CompiledPaths>(); + RUY_CHECK(the_path != Path::kReference); + constexpr Path TrMulCompiledPaths = CompiledPaths & ~Path::kReference; + Matrix<LhsScalar> transposed_lhs(lhs); + Transpose(&transposed_lhs); + TrMulParams params; + CreateTrMulParams<TrMulCompiledPaths>(transposed_lhs, rhs, spec, context, dst, + the_path, ¶ms); + + if (prepacked_lhs) { + params.packed_lhs.data = prepacked_lhs->data; + params.packed_lhs.sums = prepacked_lhs->sums; + params.lhs_is_prepacked = true; + } + if (prepacked_rhs) { + params.packed_rhs.data = prepacked_rhs->data; + params.packed_rhs.sums = prepacked_rhs->sums; + params.rhs_is_prepacked = true; + } + TrMul(¶ms, context); +} + +} // namespace ruy + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_PREPACK_H_ diff --git a/tensorflow/lite/experimental/ruy/ruy_advanced.h 
b/tensorflow/lite/experimental/ruy/ruy_advanced.h new file mode 100644 index 00000000000..36382e7d8e5 --- /dev/null +++ b/tensorflow/lite/experimental/ruy/ruy_advanced.h @@ -0,0 +1,60 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_ADVANCED_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_ADVANCED_H_ + +#include "tensorflow/lite/experimental/ruy/prepack.h" + +namespace ruy { + +// Low-level, explicit pre-packing API. +// +// The cost of packing an input matrix (either the LHS or RHS) is amortized +// across the non-depth dimension of the opposite input matrix. Thus, when the +// LHS has very few rows or the RHS has very few columns, the cost of packing +// the opposite input matrix can become significant. See pack.h for further +// information on packing. +// +// This file provides an API allowing a user to explicitly pack a matrix and +// reuse the pre-packed matrix, avoiding that cost. +// +// See example_prepack.cc for example usage. + +template <Path CompiledPaths, typename LhsScalar, typename RhsScalar, + typename DstScalar, typename Spec> +void PrePackForMul(const Matrix<LhsScalar>& lhs, const Matrix<RhsScalar>& rhs, + const Spec& spec, Context* context, Matrix<DstScalar>* dst, + PrepackedMatrix* prepacked_lhs, + PrepackedMatrix* prepacked_rhs, + std::function<void*(std::size_t)> alloc_fn) { + PrePackForMulInternal<CompiledPaths>(lhs, rhs, spec, context, dst, + prepacked_lhs, prepacked_rhs, alloc_fn); +} + +template <Path CompiledPaths, typename LhsScalar, typename RhsScalar, + typename DstScalar, typename Spec> +void MulWithPrepacked(const Matrix<LhsScalar>& lhs, + const Matrix<RhsScalar>& rhs, const Spec& spec, + Context* context, Matrix<DstScalar>* dst, + PrepackedMatrix* prepacked_lhs, + PrepackedMatrix* prepacked_rhs) { + MulWithPrepackedInternal<CompiledPaths>(lhs, rhs, spec, context, dst, + prepacked_lhs, prepacked_rhs); +} + +} // namespace ruy + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_RUY_ADVANCED_H_ diff --git a/tensorflow/lite/experimental/ruy/ruy_test.bzl b/tensorflow/lite/experimental/ruy/ruy_test.bzl index c0a5000e879..df9f58ce653 100644 --- a/tensorflow/lite/experimental/ruy/ruy_test.bzl +++ b/tensorflow/lite/experimental/ruy/ruy_test.bzl @@ -6,7 +6,7 @@ corresponding to tuples of types for LHS, RHS, accumulator and destination. 
""" -def ruy_test(name, srcs, lhs_rhs_accum_dst): +def ruy_test(name, srcs, lhs_rhs_accum_dst, tags = []): for (lhs, rhs, accum, dst) in lhs_rhs_accum_dst: native.cc_test( name = "%s_%s_%s_%s_%s" % (name, lhs, rhs, accum, dst), @@ -21,6 +21,7 @@ def ruy_test(name, srcs, lhs_rhs_accum_dst): "//tensorflow/lite/experimental/ruy:test_lib", "@com_google_googletest//:gtest_main", ], + tags = tags, ) def ruy_benchmark(name, srcs, lhs_rhs_accum_dst): diff --git a/tensorflow/lite/experimental/ruy/test.h b/tensorflow/lite/experimental/ruy/test.h index 9d32d3dec47..86b74e9f12c 100644 --- a/tensorflow/lite/experimental/ruy/test.h +++ b/tensorflow/lite/experimental/ruy/test.h @@ -31,6 +31,7 @@ limitations under the License. #include <gtest/gtest.h> #include "tensorflow/lite/experimental/ruy/pmu.h" #include "tensorflow/lite/experimental/ruy/ruy.h" +#include "tensorflow/lite/experimental/ruy/ruy_advanced.h" #include "tensorflow/lite/experimental/ruy/time.h" #ifdef RUY_TEST_EXTERNAL_PATHS @@ -38,8 +39,8 @@ limitations under the License. #define EIGEN_USE_CUSTOM_THREAD_POOL #include "third_party/eigen3/Eigen/Core" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "third_party/lapack/blas.h" #include "public/gemmlowp.h" +#include "third_party/lapack/blas.h" #endif #ifdef GEMMLOWP_PROFILING @@ -318,6 +319,9 @@ void MakeLayout(int rows, int cols, Order order, LayoutStyle layout_style, template <typename Scalar> struct StorageMatrix { + StorageMatrix() = default; + StorageMatrix(const StorageMatrix&) = delete; + void operator=(const StorageMatrix&) = delete; std::vector<Scalar> data; Matrix<Scalar> matrix; }; @@ -350,6 +354,8 @@ void MakeRandom(int rows, int cols, Order order, Scalar zero_point, template <typename Scalar> struct TestResult { + void operator=(const TestResult&) = delete; + void operator=(const TestResult&&) = delete; StorageMatrix<Scalar> storage_matrix; Path path = Path::kNone; Tuning tuning = Tuning::kAuto; @@ -361,6 +367,14 @@ struct TestResult { float mispred_rate; float frontend_stall_rate; float backend_stall_rate; + + // Per-path data for pre-packing. + // This is not used by external paths or by Path::kReference. 
+ Allocator allocator; + PrepackedMatrix prepacked_lhs; + PrepackedMatrix prepacked_rhs; + bool use_prepacked_lhs = false; + bool use_prepacked_rhs = false; }; template <typename Scalar> @@ -389,6 +403,7 @@ struct TestSet final { using AccumScalar = typename SpecType::AccumScalar; using DstScalar = typename SpecType::DstScalar; using Spec = SpecType; + using TestResultType = TestResult<DstScalar>; void Run() { MakeZeroPoints(); @@ -396,6 +411,7 @@ struct TestSet final { MakeSpec(); MakeOtherParams(); MakeResultPaths(); + MakePrepackedMatrices(); Eval(); Verify(); } @@ -405,13 +421,16 @@ struct TestSet final { void MakeLhsRhs(); void MakeSpec(); void MakeResultPaths(); + void MakePrepackedMatrices(); void MakeOtherParams(); void EvalAndVerify(); void Eval(); void Verify(); - void EvalResult(TestResult<DstScalar>* result); - void Benchmark(TestResult<DstScalar>* result); + void EvalResult(TestResultType* result); + void EvalRuy(TestResultType* result); + void DoMul(TestResultType* result); + void Benchmark(TestResultType* result); void VerifyTestResults() const; void VerifyNonTrivial() const; @@ -423,6 +442,7 @@ struct TestSet final { kHasSpec, kHasOtherParams, kHasResultPaths, + kHasPrepackedMatrices, kEvaluated, kFinal }; @@ -455,7 +475,7 @@ struct TestSet final { StorageMatrix<RhsScalar> rhs; Spec spec; std::vector<AccumScalar> bias_data; - std::vector<TestResult<DstScalar>> results; + std::vector<std::unique_ptr<TestResultType>> results; std::vector<Path> paths; std::vector<ExternalPath> external_paths; @@ -463,6 +483,8 @@ struct TestSet final { bool benchmark = false; bool perchannel = false; int max_num_threads = 0; + bool benchmark_prepack_lhs = false; + bool benchmark_prepack_rhs = false; }; Context& GlobalContext() { @@ -479,13 +501,40 @@ Context& GlobalContext() { #endif #endif // defined(__has_feature) -template <typename LhsScalar, typename RhsScalar, typename DstScalar, - typename Spec> -void EvalRuy(Path path, Tuning tuning, const Matrix<LhsScalar>& lhs, - const Matrix<RhsScalar>& rhs, const Spec& spec, - Matrix<DstScalar>* dst, ExpectedOutcome expected_outcome, - bool benchmark, int max_num_threads) { - GlobalContext().explicit_tuning = tuning; +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::DoMul(TestResultType* result) { + Context* context = &GlobalContext(); + + if (!result->use_prepacked_lhs && !result->use_prepacked_rhs) { + Mul<kAllPaths>(lhs.matrix, rhs.matrix, spec, context, + &result->storage_matrix.matrix); + return; + } + + // If we prepacked an input matrix, null out its data pointer to check + // that we don't access any data through it. + Matrix<LhsScalar> null_data_lhs = lhs.matrix; + Matrix<RhsScalar> null_data_rhs = rhs.matrix; + if (result->use_prepacked_lhs) { + null_data_lhs.data = nullptr; + } + if (result->use_prepacked_rhs) { + null_data_rhs.data = nullptr; + } + + // Do the multiplication with pre-packed matrices. + PrepackedMatrix* prepacked_lhs_ptr = + result->use_prepacked_lhs ? &result->prepacked_lhs : nullptr; + PrepackedMatrix* prepacked_rhs_ptr = + result->use_prepacked_rhs ? 
&result->prepacked_rhs : nullptr; + MulWithPrepacked<kAllPaths>(null_data_lhs, null_data_rhs, spec, context, + &result->storage_matrix.matrix, prepacked_lhs_ptr, + prepacked_rhs_ptr); +} + +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::EvalRuy(TestResultType* result) { + GlobalContext().explicit_tuning = result->tuning; if (max_num_threads) { GlobalContext().max_num_threads = max_num_threads; } else if (benchmark) { @@ -493,15 +542,15 @@ void EvalRuy(Path path, Tuning tuning, const Matrix<LhsScalar>& lhs, } else { GlobalContext().max_num_threads = 1 + global_random_engine()() % 8; } - GlobalContext().SetRuntimeEnabledPaths(path); + GlobalContext().SetRuntimeEnabledPaths(result->path); if (expected_outcome == ExpectedOutcome::kSuccess) { - Mul<kAllPaths>(lhs, rhs, spec, &GlobalContext(), dst); - RUY_CHECK(GlobalContext().last_taken_path == path); + DoMul(result); + RUY_CHECK(GlobalContext().last_taken_path == result->path); } else if (expected_outcome == ExpectedOutcome::kDeath) { // TODO(benoitjacob) TSan and ASan seem to be breaking ASSERT_DEATH. // Report a bug? #if (!defined NDEBUG) && (!defined RUY_ASAN) && (!defined RUY_TSAN) - ASSERT_DEATH(Mul<kAllPaths>(lhs, rhs, spec, &GlobalContext(), dst), ""); + ASSERT_DEATH(DoMul(result), ""); #endif } else { RUY_CHECK(false); @@ -1194,9 +1243,9 @@ struct ErrorAnalysis { template <typename TestSetType> void AnalyzeTestError(const TestSetType& test_set, int first_bad_result_index, ErrorAnalysis* error_analysis) { - const auto& good_matrix = test_set.results[0].storage_matrix.matrix; + const auto& good_matrix = test_set.results[0]->storage_matrix.matrix; const auto& bad_matrix = - test_set.results[first_bad_result_index].storage_matrix.matrix; + test_set.results[first_bad_result_index]->storage_matrix.matrix; GetMatrixStats(good_matrix, &error_analysis->stats_good); GetMatrixStats(bad_matrix, &error_analysis->stats_bad); bool found_first_error = false; @@ -1396,11 +1445,14 @@ void MakeSpecClampFields(const Matrix<LhsScalar>& lhs, spec_unclamped.multiplier_exponent_perchannel = spec->multiplier_exponent_perchannel; Mul<Path::kReference>(lhs, rhs, spec_unclamped, &context, &unclamped_dst); - std::sort(unclamped_dst_data.begin(), unclamped_dst_data.end()); - const int clamp_count = static_cast<int>(std::floor(kClampRatio * size)); - RUY_CHECK_LT(clamp_count, size); - spec->clamp_min = unclamped_dst_data[clamp_count]; - spec->clamp_max = unclamped_dst_data[size - 1 - clamp_count]; + // If dst is std::int32_t, no need to set the clamp min/max. + if (!std::is_same<typename Spec::DstScalar, std::int32_t>::value) { + std::sort(unclamped_dst_data.begin(), unclamped_dst_data.end()); + const int clamp_count = static_cast<int>(std::floor(kClampRatio * size)); + RUY_CHECK_LT(clamp_count, size); + spec->clamp_min = unclamped_dst_data[clamp_count]; + spec->clamp_max = unclamped_dst_data[size - 1 - clamp_count]; + } } template <typename LhsScalar, typename RhsScalar, typename SpecType> @@ -1409,7 +1461,12 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::MakeZeroPoints() { if (!use_specified_zero_points) { MakeRandomScalar(RandomRange::kReasonableSrcZeroPoint, &lhs_zero_point); MakeRandomScalar(RandomRange::kReasonableSrcZeroPoint, &rhs_zero_point); - MakeRandomScalar(RandomRange::kReasonableDstZeroPoint, &dst_zero_point); + // If destination is std::int32_t, no dst_zero_point is necessary. 
+ if (std::is_same<DstScalar, std::int32_t>::value) { + dst_zero_point = 0; + } else { + MakeRandomScalar(RandomRange::kReasonableDstZeroPoint, &dst_zero_point); + } } life_stage = LifeStage::kHasZeroPoints; } @@ -1490,6 +1547,55 @@ std::vector<Tuning> EnumerateTuningsForPath(Path path, bool benchmark) { return {Tuning::kAuto}; } +template <typename LhsScalar, typename RhsScalar, typename SpecType> +void TestSet<LhsScalar, RhsScalar, SpecType>::MakePrepackedMatrices() { + RUY_CHECK(life_stage == LifeStage::kHasResultPaths); + + // Prepacked matrices are Path-dependent, so create them for each test result. + for (auto& result : results) { + // If this result uses an external path, then skip this entirely. + if (result->path == Path::kNone) { + continue; + } + // Pre-packing doesn't make sense for Path::kReference. + // TODO(silvasean): Make Path::kReference an ExternalPath? + if (result->path == Path::kReference) { + continue; + } + + // Determine whether we should create/use prepacked matrices. + if (benchmark) { + // For benchmarking, do as requested. + result->use_prepacked_lhs = benchmark_prepack_lhs; + result->use_prepacked_rhs = benchmark_prepack_rhs; + } else { + // When testing, randomly pre-pack sometimes. But don't do it too often. + result->use_prepacked_lhs = (global_random_engine()() & 7) == 0; + result->use_prepacked_rhs = (global_random_engine()() & 7) == 0; + } + + // Create the pre-packed matrices. + PrepackedMatrix* prepacked_lhs_ptr = + result->use_prepacked_lhs ? &result->prepacked_lhs : nullptr; + PrepackedMatrix* prepacked_rhs_ptr = + result->use_prepacked_rhs ? &result->prepacked_rhs : nullptr; + auto alloc_fn = [&result](std::size_t num_bytes) { + return result->allocator.AllocateBytes(num_bytes); + }; + // Use a dst with a null data pointer to check that the pre-packing + // invocation doesn't write into it. + Matrix<DstScalar> null_data_dst = result->storage_matrix.matrix; + null_data_dst.data = nullptr; + GlobalContext().SetRuntimeEnabledPaths(result->path); + PrePackForMul<kAllPaths>(lhs.matrix, rhs.matrix, spec, &GlobalContext(), + &null_data_dst, prepacked_lhs_ptr, + prepacked_rhs_ptr, alloc_fn); + RUY_CHECK(GlobalContext().last_taken_path == result->path); + } + + life_stage = LifeStage::kHasPrepackedMatrices; +} + template <typename LhsScalar, typename RhsScalar, typename SpecType> void TestSet<LhsScalar, RhsScalar, SpecType>::MakeResultPaths() { RUY_CHECK(life_stage == LifeStage::kHasOtherParams); @@ -1507,6 +1613,7 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::MakeResultPaths() { // to allow specifying e.g. ffff to mean 'all paths' regardless of whether all // those bits exist as actual paths. 
paths_bitfield = paths_bitfield & kAllPaths; + RUY_CHECK(paths_bitfield != Path::kNone); paths = PathsBitfieldAsVector(paths_bitfield); #ifdef RUY_TEST_EXTERNAL_PATHS @@ -1546,8 +1653,8 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::MakeResultPaths() { for (Path path : paths) { for (Tuning tuning : EnumerateTuningsForPath(path, benchmark)) { - results.emplace_back(); - TestResult<DstScalar>& result = results.back(); + results.emplace_back(new TestResultType); + TestResultType& result = *results.back(); result.path = path; result.tuning = tuning; MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style, @@ -1556,8 +1663,8 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::MakeResultPaths() { } for (ExternalPath external_path : external_paths) { - results.emplace_back(); - TestResult<DstScalar>& result = results.back(); + results.emplace_back(new TestResultType); + TestResultType& result = *results.back(); result.external_path = external_path; MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style, RandomRange::kGeneral, &result.storage_matrix); @@ -1572,9 +1679,7 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::EvalResult( RUY_CHECK(result->path != Path::kNone || result->external_path != ExternalPath::kNone); if (result->path != Path::kNone) { - EvalRuy(result->path, result->tuning, lhs.matrix, rhs.matrix, spec, - &result->storage_matrix.matrix, expected_outcome, benchmark, - max_num_threads); + EvalRuy(result); } else { #ifdef RUY_TEST_EXTERNAL_PATHS using TestSetType = TestSet<LhsScalar, RhsScalar, SpecType>; @@ -1637,16 +1742,35 @@ int StorageSize(const Matrix<Scalar>& matrix) { return sizeof(Scalar) * FlatSize(matrix.layout); } -template <typename Scalar> -void MakeColdData(int num_copies, const Matrix<Scalar>& matrix, - std::vector<Scalar>* cold_data) { - const int flatsize = FlatSize(matrix.layout); - cold_data->resize(num_copies * flatsize); - for (int i = 0; i < num_copies; i++) { - memcpy(cold_data->data() + i * flatsize, matrix.data.get(), - sizeof(Scalar) * flatsize); +// Helper that replicates a buffer and gives out pointers to the replicas. +// This is useful when one wants to traverse data so that it is cold in cache. +// By having a sufficiently large value of num_repeats, one can ensure that the +// working set covered by the replicas is greater than the cache size. 
+template <typename T> +class RepeatedBuffer { + public: + RepeatedBuffer() = default; + void Init(const T* elems, std::size_t num_elems, int num_repeats) { + buffers_.clear(); + allocator_.FreeAll(); + for (int i = 0; i < num_repeats; i++) { + T* p; + allocator_.Allocate(num_elems, &p); + memcpy(p, elems, num_elems * sizeof(T)); + buffers_.push_back(p); + } } -} + T* Next() { + T* ret = buffers_[current_]; + current_ = (current_ + 1) % buffers_.size(); + return ret; + } + + private: + Allocator allocator_; + std::vector<T*> buffers_; + int current_ = 0; +}; template <typename LhsScalar, typename RhsScalar, typename SpecType> void TestSet<LhsScalar, RhsScalar, SpecType>::Benchmark( @@ -1654,14 +1778,20 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::Benchmark( using DstScalar = typename SpecType::DstScalar; const bool cold = getenv("RUY_BENCHMARK_COLD"); - const LhsScalar* orig_lhs_data = nullptr; - const RhsScalar* orig_rhs_data = nullptr; - DstScalar* orig_dst_data = nullptr; - std::vector<LhsScalar> cold_lhs_data; - std::vector<RhsScalar> cold_rhs_data; - std::vector<DstScalar> cold_dst_data; + LhsScalar* orig_lhs_data = lhs.matrix.data.get(); + RhsScalar* orig_rhs_data = rhs.matrix.data.get(); + DstScalar* orig_dst_data = result->storage_matrix.matrix.data.get(); + void* orig_prepacked_lhs_data = result->prepacked_lhs.data; + void* orig_prepacked_rhs_data = result->prepacked_rhs.data; + int num_matmul_sets = 0; + RepeatedBuffer<LhsScalar> cold_lhs; + RepeatedBuffer<RhsScalar> cold_rhs; + RepeatedBuffer<DstScalar> cold_dst; + RepeatedBuffer<char> cold_prepacked_lhs; + RepeatedBuffer<char> cold_prepacked_rhs; + if (cold) { const int kWorkingSetSize = 100 << 20; const int each_matmul_set_size = StorageSize(lhs.matrix) + @@ -1670,14 +1800,21 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::Benchmark( num_matmul_sets = (kWorkingSetSize + each_matmul_set_size - 1) / each_matmul_set_size; - MakeColdData(num_matmul_sets, lhs.matrix, &cold_lhs_data); - MakeColdData(num_matmul_sets, rhs.matrix, &cold_rhs_data); - MakeColdData(num_matmul_sets, result->storage_matrix.matrix, - &cold_dst_data); - - orig_lhs_data = lhs.matrix.data.get(); - orig_rhs_data = rhs.matrix.data.get(); - orig_dst_data = result->storage_matrix.matrix.data.get(); + cold_lhs.Init(lhs.matrix.data.get(), FlatSize(lhs.matrix.layout), + num_matmul_sets); + cold_rhs.Init(rhs.matrix.data.get(), FlatSize(rhs.matrix.layout), + num_matmul_sets); + cold_dst.Init(result->storage_matrix.matrix.data.get(), + FlatSize(result->storage_matrix.matrix.layout), + num_matmul_sets); + if (benchmark_prepack_lhs) { + cold_prepacked_lhs.Init(static_cast<char*>(result->prepacked_lhs.data), + result->prepacked_lhs.data_size, num_matmul_sets); + } + if (benchmark_prepack_rhs) { + cold_prepacked_rhs.Init(static_cast<char*>(result->prepacked_rhs.data), + result->prepacked_rhs.data_size, num_matmul_sets); + } } int kRepeats = 4; const double kBenchmarkMinSecs = 0.5; @@ -1696,7 +1833,6 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::Benchmark( #endif double latency = std::numeric_limits<double>::infinity(); - int data_index = 0; const bool record_pmu = getenv("RUY_BENCHMARK_PMU"); for (int repeat = 0; repeat < kRepeats; repeat++) { PmuEvents pmu_events; @@ -1710,16 +1846,14 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::Benchmark( while (ToSeconds(t - time_start) < kBenchmarkMinSecs) { for (int i = 0; i < iters_at_a_time; i++) { if (cold) { - lhs.matrix.data = - cold_lhs_data.data() + data_index * FlatSize(lhs.matrix.layout); - rhs.matrix.data = - 
cold_rhs_data.data() + data_index * FlatSize(rhs.matrix.layout); - result->storage_matrix.matrix.data = - cold_dst_data.data() + - data_index * FlatSize(result->storage_matrix.matrix.layout); - data_index++; - if (data_index == num_matmul_sets) { - data_index = 0; + lhs.matrix.data = cold_lhs.Next(); + rhs.matrix.data = cold_rhs.Next(); + result->storage_matrix.matrix.data = cold_dst.Next(); + if (benchmark_prepack_lhs) { + result->prepacked_lhs.data = cold_prepacked_lhs.Next(); + } + if (benchmark_prepack_rhs) { + result->prepacked_rhs.data = cold_prepacked_rhs.Next(); } } EvalResult(result); @@ -1755,19 +1889,21 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::Benchmark( lhs.matrix.data = orig_lhs_data; rhs.matrix.data = orig_rhs_data; memcpy(orig_dst_data, result->storage_matrix.matrix.data.get(), - sizeof(DstScalar) * FlatSize(result->storage_matrix.matrix.layout)); + StorageSize(result->storage_matrix.matrix)); result->storage_matrix.matrix.data = orig_dst_data; + result->prepacked_lhs.data = orig_prepacked_lhs_data; + result->prepacked_rhs.data = orig_prepacked_rhs_data; } } template <typename LhsScalar, typename RhsScalar, typename SpecType> void TestSet<LhsScalar, RhsScalar, SpecType>::Eval() { - RUY_CHECK(life_stage == LifeStage::kHasResultPaths); + RUY_CHECK(life_stage == LifeStage::kHasPrepackedMatrices); for (auto& result : results) { if (benchmark) { - Benchmark(&result); + Benchmark(result.get()); } else { - EvalResult(&result); + EvalResult(result.get()); } } life_stage = LifeStage::kEvaluated; @@ -1795,16 +1931,16 @@ template <typename LhsScalar, typename RhsScalar, typename SpecType> void TestSet<LhsScalar, RhsScalar, SpecType>::VerifyTestResults() const { const int depth = lhs.matrix.layout.cols; for (int i = 0; i < results.size() - 1; i++) { - if (!Agree(results[i], results[i + 1], depth)) { + if (!Agree(*results[i], *results[i + 1], depth)) { std::string paths_in_agreement; - paths_in_agreement.append(PathName(results[0])); + paths_in_agreement.append(PathName(*results[0])); for (int j = 1; j <= i; j++) { paths_in_agreement.append(", "); - paths_in_agreement.append(PathName(results[j])); + paths_in_agreement.append(PathName(*results[j])); } ErrorAnalysis error_analysis; AnalyzeTestError(*this, i + 1, &error_analysis); - std::cerr << "Error: path (" << PathName(results[i + 1]) + std::cerr << "Error: path (" << PathName(*results[i + 1]) << ") disagrees with the other paths (" << paths_in_agreement << "), which agree with each other." 
<< std::endl; std::cerr << "Shape: rows = " << rows << ", cols = " << cols @@ -1833,12 +1969,12 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::VerifyTestResults() const { std::cerr << "Bad value : " << error_analysis.first_error_bad_value << std::endl; std::cerr << "Region of Good result matrix around first error:\n\n" - << DumpRegion(results[0].storage_matrix.matrix, + << DumpRegion(results[0]->storage_matrix.matrix, error_analysis.row_of_first_error, error_analysis.col_of_first_error) << std::endl; std::cerr << "Region of Bad result matrix around first error:\n\n" - << DumpRegion(results[i + 1].storage_matrix.matrix, + << DumpRegion(results[i + 1]->storage_matrix.matrix, error_analysis.row_of_first_error, error_analysis.col_of_first_error) << std::endl; @@ -1852,12 +1988,12 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::VerifyNonTrivial() const { if (getenv("QUICK_BENCHMARK")) { return; } - if (results.front().path != Path::kReference) { + if (results.front()->path != Path::kReference) { return; } Context context; context.SetRuntimeEnabledPaths(Path::kReference); - const auto& dst_storage = results.front().storage_matrix; + const auto& dst_storage = results.front()->storage_matrix; const Matrix<DstScalar>& dst = dst_storage.matrix; Matrix<DstScalar> unclamped_dst; unclamped_dst.layout = dst.layout; diff --git a/tensorflow/lite/experimental/swift/README.md b/tensorflow/lite/experimental/swift/README.md index 2ed07ba2a60..3e5badf448a 100644 --- a/tensorflow/lite/experimental/swift/README.md +++ b/tensorflow/lite/experimental/swift/README.md @@ -5,7 +5,7 @@ solution for Swift developers. It enables low-latency inference of on-device machine learning models with a small binary size and fast performance supporting hardware acceleration. -## Getting Started +## Build TensorFlow with iOS support To build the Swift TensorFlow Lite library on Apple platforms, [install from source](https://www.tensorflow.org/install/source#setup_for_linux_and_macos) @@ -19,9 +19,25 @@ python configure.py Follow the prompts and when asked to build TensorFlow with iOS support, enter `y`. -### Bazel +### CocoaPods developers -In your `BUILD` file, add the `TensorFlowLite` dependency: +Add the TensorFlow Lite pod to your `Podfile`: + +```ruby +pod 'TensorFlowLiteSwift' +``` + +Then, run `pod install`. + +In your Swift files, import the module: + +```swift +import TensorFlowLite +``` + +### Bazel developers + +In your `BUILD` file, add the `TensorFlowLite` dependency to your target: ```python swift_library( @@ -49,12 +65,12 @@ Build the `TensorFlowLiteTests` target: bazel test tensorflow/lite/experimental/swift:TensorFlowLiteTests --swiftcopt=-enable-testing ``` -Note that `--swiftcopt=-enable-testing` is required for optimized builds (`-c opt`). +Note: `--swiftcopt=-enable-testing` is required for optimized builds (`-c opt`). 
-### Tulsi +#### Generate the Xcode project using Tulsi -Open the `TensorFlowLite.tulsiproj` using the -[TulsiApp](https://github.com/bazelbuild/tulsi) +Open the `//tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj` using +the [TulsiApp](https://github.com/bazelbuild/tulsi) or by running the [`generate_xcodeproj.sh`](https://github.com/bazelbuild/tulsi/blob/master/src/tools/generate_xcodeproj.sh) script from the root `tensorflow` directory: @@ -62,19 +78,3 @@ script from the root `tensorflow` directory: ```shell generate_xcodeproj.sh --genconfig tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj:TensorFlowLite --outputfolder ~/path/to/generated/TensorFlowLite.xcodeproj ``` - -### CocoaPods - -Add the following to your `Podfile`: - -```ruby -pod 'TensorFlowLiteSwift' -``` - -Then, run `pod install`. - -In your Swift files, import the module: - -```swift -import TensorFlowLite -``` diff --git a/tensorflow/lite/experimental/swift/Sources/Tensor.swift b/tensorflow/lite/experimental/swift/Sources/Tensor.swift index 161c42be7eb..0509cc17a39 100644 --- a/tensorflow/lite/experimental/swift/Sources/Tensor.swift +++ b/tensorflow/lite/experimental/swift/Sources/Tensor.swift @@ -58,17 +58,19 @@ public struct Tensor { /// Supported TensorFlow Lite tensor data types. public enum TensorDataType: Equatable { - /// 32-bit single precision floating point tensor data type. + /// 32-bit single precision floating point. case float32 - /// 8-bit unsigned integer tensor data type. + /// 16-bit half precision floating point. + case float16 + /// 8-bit unsigned integer. case uInt8 - /// 16-bit signed integer tensor data type. + /// 16-bit signed integer. case int16 - /// 32-bit signed integer tensor data type. + /// 32-bit signed integer. case int32 - /// 64-bit signed integer tensor data type. + /// 64-bit signed integer. case int64 - /// Boolean tensor data type. + /// Boolean. case bool /// Creates a new tensor data type from the given `TFL_Type` or `nil` if the data type is @@ -79,6 +81,8 @@ public enum TensorDataType: Equatable { switch type { case kTfLiteFloat32: self = .float32 + case kTfLiteFloat16: + self = .float16 case kTfLiteUInt8: self = .uInt8 case kTfLiteInt16: diff --git a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec index 6e83da0e2ba..3210ccc06c0 100644 --- a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec +++ b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec @@ -1,5 +1,3 @@ -# Run `pod lib lint TensorFlowLiteSwift.podspec` to ensure this is a valid spec. - Pod::Spec.new do |s| s.name = 'TensorFlowLiteSwift' s.version = '0.2.0' diff --git a/tensorflow/lite/experimental/writer/enum_mapping.h b/tensorflow/lite/experimental/writer/enum_mapping.h index 4556f7463f7..77f7b26cbc2 100644 --- a/tensorflow/lite/experimental/writer/enum_mapping.h +++ b/tensorflow/lite/experimental/writer/enum_mapping.h @@ -62,6 +62,8 @@ inline TensorType TfLiteTypeToSchemaType(TfLiteType type) { return TensorType_FLOAT32; // TODO(aselle): Consider an error. 
case kTfLiteFloat32: return TensorType_FLOAT32; + case kTfLiteFloat16: + return TensorType_FLOAT16; case kTfLiteInt32: return TensorType_INT32; case kTfLiteUInt8: diff --git a/tensorflow/lite/g3doc/convert/index.md b/tensorflow/lite/g3doc/convert/index.md index 45802fe3fa2..f9c6d9f6cfe 100644 --- a/tensorflow/lite/g3doc/convert/index.md +++ b/tensorflow/lite/g3doc/convert/index.md @@ -1,15 +1,23 @@ # TensorFlow Lite converter -TensorFlow Lite uses the optimized -[FlatBuffer](https://google.github.io/flatbuffers/) format to represent graphs. -Therefore, a TensorFlow model -([protocol buffer](https://developers.google.com/protocol-buffers/)) needs to be -converted into a `FlatBuffer` file before deploying to clients. +The TensorFlow Lite converter is used to convert TensorFlow models into an +optimized [FlatBuffer](https://google.github.io/flatbuffers/) format, so that +they can be used by the TensorFlow Lite interpreter. Note: This page contains documentation on the converter API for TensorFlow 1.x. The API for TensorFlow 2.0 is available [here](https://www.tensorflow.org/lite/r2/convert/). +## FlatBuffers + +FlatBuffer is an efficient open-source cross-platform serialization library. It +is similar to +[protocol buffers](https://developers.google.com/protocol-buffers), with the +distinction that FlatBuffers do not need a parsing/unpacking step to a secondary +representation before data can be accessed, avoiding per-object memory +allocation. The code footprint of FlatBuffers is an order of magnitude smaller +than protocol buffers. + ## From model training to device deployment The TensorFlow Lite converter generates a TensorFlow Lite @@ -20,14 +28,13 @@ The converter supports the following input formats: * [SavedModels](https://www.tensorflow.org/guide/saved_model#using_savedmodel_with_estimators) * Frozen `GraphDef`: Models generated by - [freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py). + [freeze_graph.py](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py). * `tf.keras` HDF5 models. * Any model taken from a `tf.Session` (Python API only). -The TensorFlow Lite `FlatBuffer` file is then deployed to a client device -(generally a mobile or embedded device), and the TensorFlow Lite interpreter -uses the compressed model for on-device inference. This conversion process is -shown in the diagram below: +The TensorFlow Lite `FlatBuffer` file is then deployed to a client device, and +the TensorFlow Lite interpreter uses the compressed model for on-device +inference. This conversion process is shown in the diagram below:  diff --git a/tensorflow/lite/g3doc/guide/get_started.md b/tensorflow/lite/g3doc/guide/get_started.md index 2e42c95cfad..e20dc08d0ca 100644 --- a/tensorflow/lite/g3doc/guide/get_started.md +++ b/tensorflow/lite/g3doc/guide/get_started.md @@ -1,270 +1,286 @@ # Get started with TensorFlow Lite -Using a TensorFlow Lite model in your mobile app requires multiple -considerations: you must choose a pre-trained or custom model, convert the model -to a TensorFLow Lite format, and finally, integrate the model in your app. +TensorFlow Lite provides all the tools you need to convert and run TensorFlow +models on mobile, embedded, and IoT devices. The following guide walks through +each step of the developer workflow and provides links to further instructions. ## 1. 
Choose a model -Depending on the use case, you can choose one of the popular open-sourced models, -such as *InceptionV3* or *MobileNets*, and re-train these models with a custom -data set or even build your own custom model. +<a id="1_choose_a_model"></a> + +TensorFlow Lite allows you to run TensorFlow models on a wide range of devices. +A TensorFlow model is a data structure that contains the logic and knowledge of +a machine learning network trained to solve a particular problem. + +There are many ways to obtain a TensorFlow model, from using pre-trained models +to training your own. To use a model with TensorFlow Lite it must be converted +into a special format. This is explained in section 2, +[Convert the model](#2_convert_the_model_format). + +Note: Not all TensorFlow models will work with TensorFlow Lite, since the +interpreter supports a limited subset of TensorFlow operations. See section 2, +[Convert the model](#2_convert_the_model_format) to learn about compatibility. ### Use a pre-trained model -[MobileNets](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html) -is a family of mobile-first computer vision models for TensorFlow designed to -effectively maximize accuracy, while taking into consideration the restricted -resources for on-device or embedded applications. MobileNets are small, -low-latency, low-power models parameterized to meet the resource constraints for -a variety of uses. They can be used for classification, detection, embeddings, and -segmentation—similar to other popular large scale models, such as -[Inception](https://arxiv.org/pdf/1602.07261.pdf). Google provides 16 pre-trained -[ImageNet](http://www.image-net.org/challenges/LSVRC/) classification checkpoints -for MobileNets that can be used in mobile projects of all sizes. +The TensorFlow Lite team provides a set of pre-trained models that solve a +variety of machine learning problems. These models have been converted to work +with TensorFlow Lite and are ready to use in your applications. -[Inception-v3](https://arxiv.org/abs/1512.00567) is an image recognition model -that achieves fairly high accuracy recognizing general objects with 1000 classes, -for example, "Zebra", "Dalmatian", and "Dishwasher". The model extracts general -features from input images using a convolutional neural network and classifies -them based on those features with fully-connected and softmax layers. +The pre-trained models include: -[On Device Smart Reply](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html) -is an on-device model that provides one-touch replies for incoming text messages -by suggesting contextually relevant messages. The model is built specifically for -memory constrained devices, such as watches and phones, and has been successfully -used in Smart Replies on Android Wear. Currently, this model is Android-specific. +* [Image classification](../models/image_classification/overview.md) +* [Object detection](../models/object_detection/overview.md) +* [Smart reply](../models/smart_reply/overview.md) +* [Pose estimation](../models/pose_estimation/overview.md) +* [Segmentation](../models/segmentation/overview.md) -These pre-trained models are [available for download](hosted_models.md). +See our full list of pre-trained models in [Models](../models). -### Re-train Inception-V3 or MobileNet for a custom data set +#### Models from other sources -These pre-trained models were trained on the *ImageNet* data set which contains -1000 predefined classes. 
If these classes are not sufficient for your use case, -the model will need to be re-trained. This technique is called -*transfer learning* and starts with a model that has been already trained on a -problem, then retrains the model on a similar problem. Deep learning from -scratch can take days, but transfer learning is fairly quick. In order to do -this, you need to generate a custom data set labeled with the relevant classes. +There are many other places you can obtain pre-trained TensorFlow models, +including [TensorFlow Hub](https://www.tensorflow.org/hub). In most cases, these +models will not be provided in the TensorFlow Lite format, and you'll have to +[convert](#2_convert_the_model_format) them before use. -The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/) -codelab walks through the re-training process step-by-step. The code supports -both floating point and quantized inference. +### Re-train a model (transfer learning) + +Transfer learning allows you to take a trained model and re-train it to perform +another task. For example, an +[image classification](../models/image_classification/overview.md) model could +be retrained to recognize new categories of image. Re-training takes less time +and requires less data than training a model from scratch. + +You can use transfer learning to customize pre-trained models to your +application. Learn how to perform transfer learning in the +<a href="https://codelabs.developers.google.com/codelabs/recognize-flowers-with-tensorflow-on-android">Recognize +flowers with TensorFlow</a> codelab. ### Train a custom model -A developer may choose to train a custom model using Tensorflow (see the -[TensorFlow tutorials](https://www.tensorflow.org/tutorials/) for examples of building and training -models). If you have already written a model, the first step is to export this -to a `tf.GraphDef` file. This is required because some formats do not store the -model structure outside the code, and we must communicate with other parts of -the framework. See -[Exporting the Inference Graph](https://www.tensorflow.org/tutorials/keras/save_and_restore_models#save_the_entire_model) -to create file for the custom model. +If you have designed and trained your own TensorFlow model, or you have trained +a model obtained from another source, you should convert it to the TensorFlow +Lite format before use. -TensorFlow Lite currently supports a subset of TensorFlow operators. Refer to -the [TensorFlow Lite & TensorFlow Compatibility Guide](ops_compatibility.md) -for supported operators and their usage. This set of operators will continue to -grow in future Tensorflow Lite releases. +## 2. Convert the model -## 2. Convert the model format +<a id="2_convert_the_model_format"></a> -The [TensorFlow Lite Converter](../convert/index.md) accepts the following file -formats: +TensorFlow Lite is designed to execute models efficiently on devices. Some of +this efficiency comes from the use of a special format for storing models. +TensorFlow models must be converted into this format before they can be used by +TensorFlow Lite. -* `SavedModel` — A `GraphDef` and checkpoint with a signature that labels - input and output arguments to a model. See the documentation for converting - SavedModels using [Python](../convert/python_api.md#basic_savedmodel) or using - the [command line](../convert/cmdline_examples.md#savedmodel). 
-* `tf.keras` - A HDF5 file containing a model with weights and input and - output arguments generated by `tf.Keras`. See the documentation for - converting HDF5 models using - [Python](../convert/python_api.md#basic_keras_file) or using the - [command line](../convert/cmdline_examples.md#keras). -* `frozen tf.GraphDef` — A subclass of `tf.GraphDef` that does not contain - variables. A `GraphDef` can be converted to a `frozen GraphDef` by taking a - checkpoint and a `GraphDef`, and converting each variable into a constant - using the value retrieved from the checkpoint. Instructions on converting a - `tf.GraphDef` to a TensorFlow Lite model are described in the next - subsection. +Converting models reduces their file size and introduces optimizations that do +not affect accuracy. Developers can opt to further reduce file size and increase +speed of execution in exchange for some trade-offs. You can use the TensorFlow +Lite converter to choose which optimizations to apply. -### Converting a tf.GraphDef +TensorFlow Lite supports a limited subset of TensorFlow operations, so not all +models can be converted. See [Ops compatibility](#ops-compatibility) for more +information. -TensorFlow models may be saved as a .pb or .pbtxt `tf.GraphDef` file. In order -to convert the `tf.GraphDef` file to TensorFlow Lite, the model must first be -frozen. This process involves several file formats including the `frozen -GraphDef`: +### TensorFlow Lite converter -* `tf.GraphDef` (.pb or .pbtxt) — A protobuf that represents the TensorFlow - training or computation graph. It contains operators, tensors, and variables - definitions. -* *checkpoint* (.ckpt) — Serialized variables from a TensorFlow graph. Since - this does not contain a graph structure, it cannot be interpreted by itself. -* *TensorFlow Lite model* (.tflite) — A serialized - [FlatBuffer](https://google.github.io/flatbuffers/) that contains TensorFlow - Lite operators and tensors for the TensorFlow Lite interpreter. +The [TensorFlow Lite converter](../convert) is a tool that converts trained +TensorFlow models into the TensorFlow Lite format. It can also introduce +optimizations, which are covered in section 4, +[Optimize your model](#4_optimize_your_model_optional). -You must have checkpoints that contain trained weights. The `tf.GraphDef` file -only contains the structure of the graph. The process of merging the checkpoint -values with the graph structure is called *freezing the graph*. +The converter is available as a Python API. The following example shows a +TensorFlow `SavedModel` being converted into the TensorFlow Lite format: -`tf.GraphDef` and checkpoint files for MobileNet models are available -[here](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md). +```python +import tensorflow as tf -To freeze the graph, use the following command (changing the arguments): - -``` -freeze_graph --input_graph=/tmp/mobilenet_v1_224.pb \ - --input_checkpoint=/tmp/checkpoints/mobilenet-10202.ckpt \ - --input_binary=true \ - --output_graph=/tmp/frozen_mobilenet_v1_224.pb \ - --output_node_names=MobileNetV1/Predictions/Reshape_1 +converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir) +tflite_model = converter.convert() +open("converted_model.tflite", "wb").write(tflite_model) ``` -Set the `input_binary` flag to `True` when reading a binary protobuf, a `.pb` -file. Set to `False` for a `.pbtxt` file. +You can [convert TensorFlow 2.0 models](../r2/convert) in a similar way. 
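
The other 1.x input formats follow the same pattern. As a minimal sketch, assuming a hypothetical `model.h5` file saved from `tf.keras`:

```python
import tensorflow as tf

# Hypothetical path to a tf.keras model saved in HDF5 format.
keras_file = "model.h5"

# TF 1.x API: create a converter directly from the HDF5 file.
converter = tf.lite.TFLiteConverter.from_keras_model_file(keras_file)
tflite_model = converter.convert()

# Write the resulting FlatBuffer to disk for the TensorFlow Lite interpreter.
open("converted_model.tflite", "wb").write(tflite_model)
```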
-Set `input_graph` and `input_checkpoint` to the respective filenames. The -`output_node_names` may not be obvious outside of the code that built the model. -The easiest way to find them is to visualize the graph, either with -[TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard) or -`graphviz`. +The converter can also be used from the +[command line](../convert/cmdline_examples), but the Python API is recommended. -The frozen `GraphDef` is now ready for conversion to the `FlatBuffer` format -(.tflite) for use on Android or iOS devices. For Android, the TensorFlow Lite -Converter tool supports both float and quantized models. To convert the frozen -`GraphDef` to the .tflite format use a command similar to the following: +### Options -``` -tflite_convert \ - --output_file=/tmp/mobilenet_v1_1.0_224.tflite \ - --graph_def_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \ - --input_arrays=input \ - --output_arrays=MobilenetV1/Predictions/Reshape_1 -``` +The converter can convert from a variety of input types. -The -[frozen_graph.pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz) -file used here is available for download. Setting the `input_array` and -`output_array` arguments is not straightforward. The easiest way to find these -values is to explore the graph using -[TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard). Reuse -the arguments for specifying the output nodes for inference in the -`freeze_graph` step. +When [converting TensorFlow 1.x models](../convert/python_api), these are: -### Full converter reference +* [SavedModel directories](https://www.tensorflow.org/alpha/guide/saved_model) +* Frozen GraphDef (models generated by + [freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py)) +* [Keras](https://keras.io) HDF5 models +* Models taken from a `tf.Session` -The [TensorFlow Lite Converter](../convert/index.md) can be -[Python](../convert/python_api.md) or from the -[command line](../convert/cmdline_examples.md). This allows you to integrate the -conversion step into the model design workflow, ensuring the model is easy to -convert to a mobile inference graph. +When [converting TensorFlow 2.x models](../r2/convert/python_api), these are: + +* [SavedModel directories](https://www.tensorflow.org/alpha/guide/saved_model) +* [`tf.keras` models](https://www.tensorflow.org/alpha/guide/keras/overview) +* [Concrete functions](../r2/convert/concrete_function.md) + +The converter can be configured to apply various optimizations that can improve +performance or reduce file size. This is covered in section 4, +[Optimize your model](#4_optimize_your_model_optional). ### Ops compatibility -Refer to the [ops compatibility guide](ops_compatibility.md) for -troubleshooting help, and if that doesn't help, please -[file an issue](https://github.com/tensorflow/tensorflow/issues). +TensorFlow Lite currently supports a [limited subset](ops_compatibility.md) of +TensorFlow operations. The long term goal is for all TensorFlow operations to be +supported. -### Graph Visualization tool +If the model you wish to convert contains unsupported operations, you can use +[TensorFlow Select](ops_select.md) to include operations from TensorFlow. This +will result in a larger binary being deployed to devices. -The [development repo](https://github.com/tensorflow/tensorflow) contains a tool -to visualize TensorFlow Lite models after conversion. 
To build the -[visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/visualize.py) -tool: +## 3. Run inference with the model -```sh -bazel run tensorflow/lite/tools:visualize -- model.tflite model_viz.html +<a id="3_use_the_tensorflow_lite_model_for_inference_in_a_mobile_app"></a> + +*Inference* is the process of running data through a model to obtain +predictions. It requires a model, an interpreter, and input data. + +### TensorFlow Lite interpreter + +The [TensorFlow Lite interpreter](inference.md) is a library that takes a model +file, executes the operations it defines on input data, and provides access to +the output. + +The interpreter works across multiple platforms and provides a simple API for +running TensorFlow Lite models from Java, Swift, Objective-C, C++, and Python. + +The following code shows the interpreter being invoked from Java: + +```java +try (Interpreter interpreter = new Interpreter(tensorflow_lite_model_file)) { + interpreter.run(input, output); +} ``` -This generates an interactive HTML page listing subgraphs, operations, and a -graph visualization. +### GPU acceleration and Delegates -## 3. Use the TensorFlow Lite model for inference in a mobile app +Some devices provide hardware acceleration for machine learning operations. For +example, most mobile phones have GPUs, which can perform floating point matrix +operations faster than a CPU. -After completing the prior steps, you should now have a `.tflite` model file. +The speed-up can be substantial. For example, a MobileNet v1 image +classification model runs 5.5x faster on a Pixel 3 phone when GPU acceleration +is used. -### Android +The TensorFlow Lite interpreter can be configured with +[Delegates](../performance/delegates.md) to make use of hardware acceleration on +different devices. The [GPU Delegate](../performance/gpu.md) allows the +interpreter to run appropriate operations on the device's GPU. -Since Android apps are written in Java and the core TensorFlow library is in C++, -a JNI library is provided as an interface. This is only meant for inference—it -provides the ability to load a graph, set up inputs, and run the model to -calculate outputs. +The following code shows the GPU Delegate being used from Java: -The open source Android demo app uses the JNI interface and is available -[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/java/demo/app). -You can also download a -[prebuilt APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk). -See the <a href="./android.md">Android demo</a> guide for details. +```java +GpuDelegate delegate = new GpuDelegate(); +Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate); +Interpreter interpreter = new Interpreter(tensorflow_lite_model_file, options); +try { + interpreter.run(input, output); +} +``` -The <a href="./android.md">Android mobile</a> guide has instructions for -installing TensorFlow on Android and setting up `bazel` and Android Studio. +To add support for new hardware accelerators you can +[define your own delegate](../performance/delegates.md#how_to_add_a_delegate). -### iOS +### Android and iOS -To integrate a TensorFlow model in an iOS app, see the -[TensorFlow Lite for iOS](ios.md) guide and <a href="./ios.md">iOS demo</a> -guide. +The TensorFlow Lite interpreter is easy to use from both major mobile platforms. +To get started, explore the [Android quickstart](android.md) and +[iOS quickstart](ios.md) guides. 
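
The Java snippets shown earlier in this section have a direct Python counterpart. As a minimal sketch, assuming a converted `converted_model.tflite` file and a single input tensor:

```python
import numpy as np
import tensorflow as tf

# Load the TensorFlow Lite model and allocate its tensors.
interpreter = tf.lite.Interpreter(model_path="converted_model.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Build a random input with the expected shape and dtype, then run inference.
input_shape = tuple(input_details[0]["shape"])
input_data = np.random.random_sample(input_shape).astype(input_details[0]["dtype"])
interpreter.set_tensor(input_details[0]["index"], input_data)
interpreter.invoke()

# Read back the result.
output_data = interpreter.get_tensor(output_details[0]["index"])
print(output_data)
```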
+[Example applications](https://www.tensorflow.org/lite/examples) are available +for both platforms. -#### Core ML support +To obtain the required libraries, Android developers should use the +[TensorFlow Lite AAR](android.md#use_the_tensorflow_lite_aar_from_jcenter). iOS +developers should use the +[CocoaPods for Swift or Objective-C](ios.md#add_tensorflow_lite_to_your_swift_or_objective-c_project). -Core ML is a machine learning framework used in Apple products. In addition to -using Tensorflow Lite models directly in your applications, you can convert -trained Tensorflow models to the -[CoreML](https://developer.apple.com/machine-learning/) format for use on Apple -devices. To use the converter, refer to the -[Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml). +### Linux -### ARM32 and ARM64 Linux +Embedded Linux is an important platform for deploying machine learning. We +provide build instructions for both [Raspberry Pi](build_rpi.md) and +[Arm64-based boards](build_arm64.md) such as Odroid C2, Pine64, and NanoPi. -Compile Tensorflow Lite for a Raspberry Pi by following the -[RPi build instructions](build_rpi.md) Compile Tensorflow Lite for a generic aarch64 -board such as Odroid C2, Pine64, NanoPi, and others by following the -[ARM64 Linux build instructions](build_arm64.md) This compiles a static -library file (`.a`) used to build your app. There are plans for Python bindings -and a demo app. +### Microcontrollers -## 4. Optimize your model (optional) +[TensorFlow Lite for Microcontrollers](../microcontrollers/overview.md) is an +experimental port of TensorFlow Lite aimed at microcontrollers and other devices +with only kilobytes of memory. -There are two options. If you plan to run on CPU, we recommend that you quantize -your weights and activation tensors. If the hardware is available, another -option is to run on GPU for massively parallelizable workloads. +### Operations + +If your model requires TensorFlow operations that are not yet implemented in +TensorFlow Lite, you can use [TensorFlow Select](ops_select.md) to use them in +your model. You'll need to build a custom version of the interpreter that +includes the TensorFlow operations. + +You can use [Custom operators](ops_custom.md) to write your own operations, or +port new operations into TensorFlow Lite. + +[Operator versions](ops_version.md) allows you to add new functionalities and +parameters into existing operations. + +## 4. Optimize your model + +<a id="4_optimize_your_model_optional"></a> + +TensorFlow Lite provides tools to optimize the size and performance of your +models, often with minimal impact on accuracy. Optimized models may require +slightly more complex training, conversion, or integration. + +Machine learning optimization is an evolving field, and TensorFlow Lite's +[Model Optimization Toolkit](#model-optimization-toolkit) is continually growing +as new techniques are developed. + +### Performance + +The goal of model optimization is to reach the ideal balance of performance, +model size, and accuracy on a given device. +[Performance best practices](../performance/best_practices.md) can help guide +you through this process. ### Quantization -Compress your model size by lowering the precision of the parameters (i.e. -neural network weights) from their training-time 32-bit floating-point -representations into much smaller and efficient 8-bit integer ones. 
-This will execute the heaviest computations fast in lower precision, but the
-most sensitive ones with higher precision, thus typically resulting in little to
-no final accuracy losses for the task, yet a significant speed-up over pure
-floating-point execution.
+By reducing the precision of values and operations within a model, quantization
+can reduce both the size of a model and the time required for inference. For
+many models, there is only a minimal loss of accuracy.
-The post-training quantization technique is integrated into the TensorFlow Lite
-conversion tool. Getting started is easy: after building your TensorFlow model,
-simply enable the ‘post_training_quantize’ flag in the TensorFlow Lite
-conversion tool. Assuming that the saved model is stored in saved_model_dir, the
-quantized tflite flatbuffer can be generated in command line:
+The TensorFlow Lite converter makes it easy to quantize TensorFlow models. The
+following Python code quantizes a `SavedModel` and saves it to disk:
+
+```python
+import tensorflow as tf
-```
 converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
 converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
 tflite_quant_model = converter.convert()
+open("converted_model.tflite", "wb").write(tflite_quant_model)
 ```
-Read the full documentation [here](../performance/post_training_quantization.md)
-and see a tutorial
-[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tutorials/post_training_quant.ipynb).
+To learn more about quantization, see
+[Post-training quantization](../performance/post_training_quantization.md).
-### GPU
-Run on GPU GPUs are designed to have high throughput for massively
-parallelizable workloads. Thus, they are well-suited for deep neural nets, which
-consist of a huge number of operators, each working on some input tensor(s) that
-can be easily divided into smaller workloads and carried out in parallel,
-typically resulting in lower latency.
+### Model Optimization Toolkit
-Another benefit with GPU inference is its power efficiency. GPUs carry out the
-computations in a very efficient and optimized manner, so that they consume less
-power and generate less heat than when the same task is run on CPUs.
+The [Model Optimization Toolkit](../performance/model_optimization.md) is a set
+of tools and techniques designed to make it easy for developers to optimize
+their models. Many of the techniques can be applied to all TensorFlow models and
+are not specific to TensorFlow Lite, but they are especially valuable when
+running inference on devices with limited resources.
-Read the tutorial [here](../performance/gpu.md) and full documentation [here](../performance/gpu_advanced.md).
+## Next steps
+
+Now that you're familiar with TensorFlow Lite, explore some of the following
+resources:
+
+* If you're a mobile developer, visit [Android quickstart](android.md) or
+  [iOS quickstart](ios.md).
+* Explore our [pre-trained models](../models).
+* Try our [example apps](https://www.tensorflow.org/lite/examples).
diff --git a/tensorflow/lite/g3doc/guide/hosted_models.md b/tensorflow/lite/g3doc/guide/hosted_models.md index 69f196782ea..323d31ba897 100644 --- a/tensorflow/lite/g3doc/guide/hosted_models.md +++ b/tensorflow/lite/g3doc/guide/hosted_models.md @@ -39,7 +39,7 @@ Mobilenet_V1_1.0_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tf Mobilenet_V1_1.0_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_160_quant.tgz) | 4.3 Mb | 66.9% | 86.7% | 37.4 ms Mobilenet_V1_1.0_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_192_quant.tgz) | 4.3 Mb | 69.1% | 88.1% | 51.9 ms Mobilenet_V1_1.0_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz) | 4.3 Mb | 70.0% | 89.0% | 70.2 ms -Mobilenet_V2_1.0_224_quant | [paper](https://arxiv.org/abs/1806.08342), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz) | 3.4 Mb | 70.8% | 89.9% | 80.3 ms +Mobilenet_V2_1.0_224_quant | [paper](https://arxiv.org/abs/1806.08342), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz) | 3.4 Mb | 70.8% | 89.9% | 53.4 ms Inception_V1_quant | [paper](https://arxiv.org/abs/1409.4842), [tflite&pb](http://download.tensorflow.org/models/inception_v1_224_quant_20181026.tgz) | 6.4 Mb | 70.1% | 89.8% | 154.5 ms Inception_V2_quant | [paper](https://arxiv.org/abs/1512.00567), [tflite&pb](http://download.tensorflow.org/models/inception_v2_224_quant_20181026.tgz) | 11 Mb | 73.5% | 91.4% | 235.0 ms Inception_V3_quant | [paper](https://arxiv.org/abs/1806.08342),[tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/inception_v3_quant.tgz) | 23 Mb | 77.5% | 93.7% | 637 ms diff --git a/tensorflow/lite/g3doc/guide/index.md b/tensorflow/lite/g3doc/guide/index.md index 288f7a07576..2475c7e1132 100644 --- a/tensorflow/lite/g3doc/guide/index.md +++ b/tensorflow/lite/g3doc/guide/index.md @@ -1,202 +1,121 @@ - # TensorFlow Lite guide -TensorFlow Lite is TensorFlow’s lightweight solution for mobile and embedded -devices. It enables on-device machine learning inference with low latency and a -small binary size. TensorFlow Lite also supports hardware acceleration with the -[Android Neural Networks -API](https://developer.android.com/ndk/guides/neuralnetworks/index.html). +TensorFlow Lite is a set of tools to help developers run TensorFlow models on +mobile, embedded, and IoT devices. It enables on-device machine learning +inference with low latency and a small binary size. -TensorFlow Lite uses many techniques for achieving low latency such as -optimizing the kernels for mobile apps, pre-fused activations, and quantized -kernels that allow smaller and faster (fixed-point math) models. +TensorFlow Lite consists of two main components: -Most of our TensorFlow Lite documentation is [on -GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite) -for the time being. +- The [TensorFlow Lite interpreter](inference.md), which runs specially + optimized models on many different hardware types, including mobile phones, + embedded Linux devices, and microcontrollers. 
+- The [TensorFlow Lite converter](../convert/index.md), which converts + TensorFlow models into an efficient form for use by the interpreter, and can + introduce optimizations to improve binary size and performance. -## What does TensorFlow Lite contain? +### Machine learning at the edge -TensorFlow Lite supports a set of core operators, both quantized and -float, which have been tuned for mobile platforms. They incorporate pre-fused -activations and biases to further enhance performance and quantized -accuracy. Additionally, TensorFlow Lite also supports using custom operations in -models. +TensorFlow Lite is designed to make it easy to perform machine learning on +devices, "at the edge" of the network, instead of sending data back and forth +from a server. For developers, performing machine learning on-device can help +improve: -TensorFlow Lite defines a new model file format, based on -[FlatBuffers](https://google.github.io/flatbuffers/). FlatBuffers is an -efficient open-source cross-platform serialization library. It is similar to -[protocol buffers](https://developers.google.com/protocol-buffers/?hl=en), but -the primary difference is that FlatBuffers does not need a parsing/unpacking -step to a secondary representation before you can access data, often coupled -with per-object memory allocation. Also, the code footprint of FlatBuffers is an -order of magnitude smaller than protocol buffers. +* *Latency:* there's no round-trip to a server +* *Privacy:* no data needs to leave the device +* *Connectivity:* an Internet connection isn't required +* *Power consumption:* network connections are power hungry -TensorFlow Lite has a new mobile-optimized interpreter, which has the key goals -of keeping apps lean and fast. The interpreter uses a static graph ordering and -a custom (less-dynamic) memory allocator to ensure minimal load, initialization, -and execution latency. +TensorFlow Lite works with a huge range of devices, from tiny microcontrollers +to powerful mobile phones. -TensorFlow Lite provides an interface to leverage hardware acceleration, if -available on the device. It does so via the -[Android Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/index.html), -available on Android 8.1 (API level 27) and higher. +Key Point: The TensorFlow Lite binary is smaller than 300KB when all supported +operators are linked, and less than 200KB when using only the operators needed +for supporting the common image classification models InceptionV3 and MobileNet. -## Why do we need a new mobile-specific library? +## Get started -Machine Learning is changing the computing paradigm, and we see an emerging -trend of new use cases on mobile and embedded devices. Consumer expectations are -also trending toward natural, human-like interactions with their devices, driven -by the camera and voice interaction models. +To begin working with TensorFlow Lite, visit [Get started](get_started.md). -There are several factors which are fueling interest in this domain: +## Key features -- Innovation at the silicon layer is enabling new possibilities for hardware - acceleration, and frameworks such as the Android Neural Networks API make it - easy to leverage these. +* *[Interpreter](inference.md) tuned for on-device ML*, supporting a set of + core operators that are optimized for on-device applications, and with a + small binary size. 
+* *Diverse platform support*, covering [Android](android.md) and [iOS](ios.md) + devices, embedded Linux, and microcontrollers, making use of platform APIs + for accelerated inference. +* *APIs for multiple languages* including Java, Swift, Objective-C, C++, and + Python. +* *High performance*, with [hardware acceleration](../performance/gpu.md) on + supported devices, device-optimized kernels, and + [pre-fused activations and biases](ops_compatibility.md). +* *Model optimization tools*, including + [quantization](../performance/post_training_quantization.md), that can + reduce size and increase performance of models without sacrificing accuracy. +* *Efficient model format*, using a [FlatBuffer](../convert/index.md) that is + optimized for small size and portability. +* *[Pre-trained models](../models)* for common machine learning tasks that can + be customized to your application. +* *[Samples and tutorials](https://www.tensorflow.org/examples)* that show you + how to deploy machine learning models on supported platforms. -- Recent advances in real-time computer-vision and spoken language understanding - have led to mobile-optimized benchmark models being open sourced - (e.g. MobileNets, SqueezeNet). +## Development workflow -- Widely-available smart appliances create new possibilities for - on-device intelligence. +The workflow for using TensorFlow Lite involves the following steps: -- Interest in stronger user data privacy paradigms where user data does not need - to leave the mobile device. +1. **Pick a model** -- Ability to serve ‘offline’ use cases, where the device does not need to be - connected to a network. + Bring your own TensorFlow model, find a model online, or pick a model from + our [Pre-trained models](../models) to drop in or retrain. -We believe the next wave of machine learning applications will have significant -processing on mobile and embedded devices. +1. **Convert the model** -## TensorFlow Lite highlights + If you're using a custom model, use the + [TensorFlow Lite converter](../convert/index.md) and a few lines of Python + to convert it to the TensorFlow Lite format. -TensorFlow Lite provides: +1. **Deploy to your device** -- A set of core operators, both quantized and float, many of which have been - tuned for mobile platforms. These can be used to create and run custom - models. Developers can also write their own custom operators and use them in - models. + Run your model on-device with the + [TensorFlow Lite interpreter](inference.md), with APIs in many languages. -- A new [FlatBuffers](https://google.github.io/flatbuffers/)-based - model file format. +1. **Optimize your model** -- On-device interpreter with kernels optimized for faster execution on mobile. + Use our [Model Optimization Toolkit](../performance/model_optimization.md) + to reduce your model's size and increase its efficiency with minimal impact + on accuracy. -- TensorFlow converter to convert TensorFlow-trained models to the TensorFlow - Lite format. +To learn more about using TensorFlow Lite in your project, see +[Get started](get_started.md). -- Smaller in size: TensorFlow Lite is smaller than 300KB when all supported - operators are linked and less than 200KB when using only the operators needed - for supporting InceptionV3 and Mobilenet. +## Technical constraints -- **Pre-tested models:** +TensorFlow Lite plans to provide high performance on-device inference for any +TensorFlow model. 
However, the TensorFlow Lite interpreter currently supports a +limited subset of TensorFlow operators that have been optimized for on-device +use. This means that some models require additional steps to work with +TensorFlow Lite. - All of the following models are guaranteed to work out of the box: +To learn which operators are available, see +[Operator compatibility](ops_compatibility.md). - - Inception V3, a popular model for detecting the dominant objects - present in an image. +If your model uses operators that are not yet supported by TensorFlow Lite +interpreter, you can use [TensorFlow Select](ops_select.md) to include +TensorFlow operations in your TensorFlow Lite build. However, this will lead to +an increased binary size. - - [MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md), - a family of mobile-first computer vision models designed to effectively - maximize accuracy while being mindful of the restricted resources for an - on-device or embedded application. They are small, low-latency, low-power - models parameterized to meet the resource constraints of a variety of use - cases. They can be built upon for classification, detection, embeddings - and segmentation. MobileNet models are smaller but [lower in - accuracy](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html) - than Inception V3. +TensorFlow Lite does not currently support on-device training, but it is in our +[Roadmap](roadmap.md), along with other planned improvements. - - On Device Smart Reply, an on-device model which provides one-touch - replies for an incoming text message by suggesting contextually relevant - messages. The model was built specifically for memory constrained devices - such as watches & phones and it has been successfully used to surface - [Smart Replies on Android - Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html) - to all first-party and third-party apps. +## Next steps - Also see the complete list of - [TensorFlow Lite's supported models](hosted_models.md), - including the model sizes, performance numbers, and downloadable model files. +Want to keep learning about TensorFlow Lite? Here are some next steps: -- Quantized versions of the MobileNet model, which runs faster than the - non-quantized (float) version on CPU. - -- New Android demo app to illustrate the use of TensorFlow Lite with a quantized - MobileNet model for object classification. - -- Java and C++ API support - - -## Getting Started - -We recommend you try out TensorFlow Lite with the pre-tested models indicated -above. If you have an existing model, you will need to test whether your model -is compatible with both the converter and the supported operator set. To test -your model, see the -[documentation on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite). - -### Retrain Inception-V3 or MobileNet for a custom data set - -The pre-trained models mentioned above have been trained on the ImageNet data -set, which consists of 1000 predefined classes. If those classes are not -relevant or useful for your use case, you will need to retrain those -models. This technique is called transfer learning, which starts with a model -that has been already trained on a problem and will then be retrained on a -similar problem. Deep learning from scratch can take days, but transfer learning -can be done fairly quickly. 
In order to do this, you'll need to generate your -custom data set labeled with the relevant classes. - -The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/) -codelab walks through this process step-by-step. The retraining code supports -retraining for both floating point and quantized inference. - -## TensorFlow Lite Architecture - -The following diagram shows the architectural design of TensorFlow Lite: - -<img src="https://www.tensorflow.org/images/tflite-architecture.jpg" - alt="TensorFlow Lite architecture diagram" - style="max-width:600px;"> - -Starting with a trained TensorFlow model on disk, you'll convert that model to -the TensorFlow Lite file format (`.tflite`) using the TensorFlow Lite -Converter. Then you can use that converted file in your mobile application. - -Deploying the TensorFlow Lite model file uses: - -- Java API: A convenience wrapper around the C++ API on Android. - -- C++ API: Loads the TensorFlow Lite Model File and invokes the Interpreter. The - same library is available on both Android and iOS. - -- Interpreter: Executes the model using a set of kernels. The interpreter - supports selective kernel loading; without kernels it is only 100KB, and 300KB - with all the kernels loaded. This is a significant reduction from the 1.5M - required by TensorFlow Mobile. - -- On select Android devices, the Interpreter will use the Android Neural - Networks API for hardware acceleration, or default to CPU execution if none - are available. - -You can also implement custom kernels using the C++ API that can be used by the -Interpreter. - -## Future Work - -In future releases, TensorFlow Lite will support more models and built-in -operators, contain performance improvements for both fixed point and floating -point models, improvements to the tools to enable easier developer workflows and -support for other smaller devices and more. As we continue development, we hope -that TensorFlow Lite will greatly simplify the developer experience of targeting -a model for small devices. - -Future plans include using specialized machine learning hardware to get the best -possible performance for a particular model on a particular device. - -## Next Steps - -The TensorFlow Lite [GitHub repository](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite). -contains additional docs, code samples, and demo applications. +* Visit [Get started](get_started.md) to walk through the process of using + TensorFlow Lite. +* If you're a mobile developer, visit [Android quickstart](android.md) or + [iOS quickstart](ios.md). +* Learn about + [TensorFlow Lite for Microcontrollers](../microcontrollers/overview.md). +* Explore our [pre-trained models](../models). +* Try our [example apps](https://www.tensorflow.org/lite/examples). diff --git a/tensorflow/lite/g3doc/guide/inference.md b/tensorflow/lite/g3doc/guide/inference.md index b0107ece0b1..353a656740e 100644 --- a/tensorflow/lite/g3doc/guide/inference.md +++ b/tensorflow/lite/g3doc/guide/inference.md @@ -1,16 +1,15 @@ # TensorFlow Lite inference -[TOC] +The term *inference* refers to the process of executing a TensorFlow Lite model +on-device in order to make predictions based on input data. Inference is the +final step in using the model on-device. -## Overview +Inference for TensorFlow Lite models is run through an interpreter. The +TensorFlow Lite interpreter is designed to be lean and fast. 
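To make the interpreter flow concrete, here is a minimal sketch of loading and running a model through the C++ API referenced above. It is illustrative only: the model filename and the assumption of a single float input and output tensor are placeholders, not part of this change.

```c++
#include <memory>

#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"

int main() {
  // Load the flatbuffer model from disk ("model.tflite" is a placeholder).
  std::unique_ptr<tflite::FlatBufferModel> model =
      tflite::FlatBufferModel::BuildFromFile("model.tflite");
  if (!model) return 1;

  // Build an interpreter backed by the built-in op resolver.
  tflite::ops::builtin::BuiltinOpResolver resolver;
  std::unique_ptr<tflite::Interpreter> interpreter;
  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
  if (!interpreter) return 1;

  // Allocate tensor buffers, fill the first input, and run inference.
  if (interpreter->AllocateTensors() != kTfLiteOk) return 1;
  float* input = interpreter->typed_input_tensor<float>(0);
  input[0] = 1.0f;  // Assumes a float input tensor with at least one element.
  interpreter->Invoke();
  float* output = interpreter->typed_output_tensor<float>(0);
  (void)output;  // Read predictions from the output buffer here.
  return 0;
}
```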
The interpreter +uses a static graph ordering and a custom (less-dynamic) memory allocator to +ensure minimal load, initialization, and execution latency. -TensorFlow Lite inference is the process of executing a TensorFlow Lite -model on-device and extracting meaningful results from it. Inference is the -final step in using the model on-device in the -[architecture](index.md#tensorflow_lite_architecture). - -Inference for TensorFlow Lite models is run through an interpreter. This -document outlines the various APIs for the interpreter along with the +This document outlines the various APIs for the interpreter, along with the [supported platforms](#supported-platforms). ### Important Concepts @@ -43,19 +42,27 @@ TensorFlow Lite inference on device typically follows the following steps. present it to their user. ### Supported Platforms + TensorFlow inference APIs are provided for most common mobile/embedded platforms such as Android, iOS and Linux. #### Android + On Android, TensorFlow Lite inference can be performed using either Java or C++ APIs. The Java APIs provide convenience and can be used directly within your -Android Activity classes. The C++ APIs on the other hand may offer more -flexibility and speed, but may require writing JNI wrappers to move data between -Java and C++ layers. You can find an example [here](android.md). +Android Activity classes. The C++ APIs offer more flexibility and speed, but may +require writing JNI wrappers to move data between Java and C++ layers. + +Visit the [Android quickstart](android.md) for a tutorial and example code. #### iOS -TensorFlow Lite provides Swift/Objective C++ APIs for inference on iOS. An -example can be found [here](ios.md). + +TensorFlow Lite provides native iOS libraries written in +[Swift](https://www.tensorflow.org/code/tensorflow/lite/experimental/swift) +and +[Objective-C](https://www.tensorflow.org/code/tensorflow/lite/experimental/objc). + +Visit the [iOS quickstart](ios.md) for a tutorial and example code. #### Linux On Linux platforms such as [Raspberry Pi](build_rpi.md), TensorFlow Lite C++ diff --git a/tensorflow/lite/g3doc/guide/ios.md b/tensorflow/lite/g3doc/guide/ios.md index 77aa64ca6fd..4c84dbdb220 100644 --- a/tensorflow/lite/g3doc/guide/ios.md +++ b/tensorflow/lite/g3doc/guide/ios.md @@ -30,12 +30,12 @@ To get started quickly writing your own iOS code, we recommend using our [Swift image classification example](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/ios) as a starting point. -The sections below walk you through the steps for adding TensorFlow Lite Swift -or Objective-C to your project: +The sections below demonstrate how to add TensorFlow Lite Swift or Objective-C +to your project: ### CocoaPods developers -In your `Podfile`, add the TensorFlow Lite pod. Then, run `pod install`: +In your `Podfile`, add the TensorFlow Lite pod. Then, run `pod install`. #### Swift @@ -52,7 +52,7 @@ pod 'TensorFlowLiteObjC' ### Bazel developers -In your `BUILD` file, add the `TensorFlowLite` dependency. +In your `BUILD` file, add the `TensorFlowLite` dependency to your target. 
#### Swift @@ -74,7 +74,7 @@ objc_library( ) ``` -### Importing the library +### Import the library For Swift files, import the TensorFlow Lite module: @@ -88,12 +88,11 @@ For Objective-C files, import the umbrella header: #import "TFLTensorFlowLite.h" ``` -Or, the TensorFlow Lite module: +Or, the module if you set `CLANG_ENABLE_MODULES = YES` in your Xcode project: ```objectivec @import TFLTensorFlowLite; ``` -Note: If importing the Objective-C TensorFlow Lite module, `CLANG_ENABLE_MODULES` -must be set to `YES`. Additionally, for CocoaPods developers, `use_frameworks!` -must be specified in your `Podfile`. +Note: For CocoaPods developers who want to import the Objective-C TensorFlow +Lite module, you must also include `use_frameworks!` in your `Podfile`. diff --git a/tensorflow/lite/g3doc/models/image_classification/overview.md b/tensorflow/lite/g3doc/models/image_classification/overview.md index 844934e467e..d4046c95cfb 100644 --- a/tensorflow/lite/g3doc/models/image_classification/overview.md +++ b/tensorflow/lite/g3doc/models/image_classification/overview.md @@ -280,5 +280,5 @@ trees in the original training data. To do this, you will need a set of training images for each of the new labels you wish to train. Learn how to perform transfer learning in the -<a href="https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/">TensorFlow -for Poets</a> codelab. +<a href="https://codelabs.developers.google.com/codelabs/recognize-flowers-with-tensorflow-on-android/#0">Recognize +flowers with TensorFlow</a> codelab. diff --git a/tensorflow/lite/g3doc/models/smart_reply/overview.md b/tensorflow/lite/g3doc/models/smart_reply/overview.md index 20c359ec9ff..b2363adcf48 100644 --- a/tensorflow/lite/g3doc/models/smart_reply/overview.md +++ b/tensorflow/lite/g3doc/models/smart_reply/overview.md @@ -13,12 +13,15 @@ starter model and labels</a> ### Sample application -We have provided a pre-built APK that demonstrates the smart reply model on -Android. +There is a TensorFlow Lite sample application that demonstrates the smart reply +model on Android. -Go to the -<a href="https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/smartreply/g3doc">GitHub -page</a> for instructions and list of supported ops and functionalities. +<a class="button button-primary" href="https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/smartreply">View +Android example</a> + +Read the +[GitHub page](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/smartreply/g3doc) +to learn how the app works. ## How it works diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc index ce99351f7b7..9edef3751dd 100644 --- a/tensorflow/lite/interpreter.cc +++ b/tensorflow/lite/interpreter.cc @@ -26,10 +26,14 @@ limitations under the License. #include "tensorflow/lite/graph_info.h" #include "tensorflow/lite/memory_planner.h" #include "tensorflow/lite/minimal_logging.h" -#include "tensorflow/lite/profiling/profiler.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/util.h" +// TODO(b/132087118): move static_assert to c_api_internal when compiled with +// C++. 
+static_assert(sizeof(TfLiteFloat16) == sizeof(uint16_t), + "Float 16 type must be 16 bits."); + namespace tflite { namespace { @@ -256,11 +260,11 @@ TfLiteStatus Interpreter::GetBufferHandle(int tensor_index, return kTfLiteOk; } -void Interpreter::SetProfiler(profiling::Profiler* profiler) { +void Interpreter::SetProfiler(Profiler* profiler) { for (auto& subgraph : subgraphs_) subgraph->SetProfiler(profiler); } -profiling::Profiler* Interpreter::GetProfiler() { +Profiler* Interpreter::GetProfiler() { return primary_subgraph().GetProfiler(); } diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h index 806b66c12a0..2d72eea588a 100644 --- a/tensorflow/lite/interpreter.h +++ b/tensorflow/lite/interpreter.h @@ -25,9 +25,9 @@ limitations under the License. #include "tensorflow/lite/allocation.h" #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/core/api/profiler.h" #include "tensorflow/lite/core/subgraph.h" #include "tensorflow/lite/memory_planner.h" -#include "tensorflow/lite/profiling/profiler.h" #include "tensorflow/lite/stderr_reporter.h" namespace tflite { @@ -74,6 +74,10 @@ constexpr TfLiteType typeToTfLiteType<string>() { return kTfLiteString; } +template <> +constexpr TfLiteType typeToTfLiteType<TfLiteFloat16>() { + return kTfLiteFloat16; +} // An interpreter for a graph of nodes that input and output from tensors. // Each node of the graph processes a set of input tensors and produces a // set of output Tensors. All inputs/output tensors are referenced by index. @@ -402,9 +406,14 @@ class Interpreter { TfLiteBufferHandle* buffer_handle, TfLiteDelegate** delegate); - void SetProfiler(profiling::Profiler* profiler); + // Sets the profiler to tracing execution. The caller retains ownership + // of the profiler and must ensure its validity. + // WARNING: This is an experimental API and subject to change. + void SetProfiler(Profiler* profiler); - profiling::Profiler* GetProfiler(); + // Gets the profiler used for op tracing. + // WARNING: This is an experimental API and subject to change. + Profiler* GetProfiler(); // The default capacity of `tensors_` vector. static constexpr int kTensorsReservedCapacity = 128; diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc index 78c3d4ddc7f..0c0c32b4eed 100644 --- a/tensorflow/lite/interpreter_test.cc +++ b/tensorflow/lite/interpreter_test.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include <gmock/gmock.h> #include <gtest/gtest.h> +#include "third_party/eigen3/Eigen/Core" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -165,7 +166,7 @@ TEST(BasicInterpreter, CheckAllocate) { } cases[] = { {kTfLiteFloat32, sizeof(float)}, {kTfLiteInt32, sizeof(int32_t)}, {kTfLiteUInt8, sizeof(uint8_t)}, {kTfLiteInt64, sizeof(int64_t)}, - {kTfLiteInt16, sizeof(int16_t)}, + {kTfLiteInt16, sizeof(int16_t)}, {kTfLiteFloat16, sizeof(TfLiteFloat16)}, }; for (auto test : cases) { @@ -238,6 +239,8 @@ TEST(BasicInterpreter, CheckResize) { const uint8_t uint8s[] = {3, 4}; const int64_t int64s[] = {6, -7}; const int16_t int16s[] = {8, -9}; + const Eigen::half float16s[] = {Eigen::half_impl::float_to_half_rtne(-3.f), + Eigen::half_impl::float_to_half_rtne(-4.f)}; struct { TfLiteType type; @@ -249,6 +252,8 @@ TEST(BasicInterpreter, CheckResize) { {kTfLiteUInt8, sizeof(uint8_t), reinterpret_cast<const char*>(uint8s)}, {kTfLiteInt64, sizeof(int64_t), reinterpret_cast<const char*>(int64s)}, {kTfLiteInt16, sizeof(int16_t), reinterpret_cast<const char*>(int16s)}, + {kTfLiteFloat16, sizeof(TfLiteFloat16), + reinterpret_cast<const char*>(float16s)}, }; for (auto test : cases) { @@ -283,10 +288,8 @@ TEST(BasicInterpreter, CheckResize) { TEST(BasicInterpreter, CheckAlignment) { struct { TfLiteType type; - } cases[] = { - {kTfLiteFloat32}, {kTfLiteInt32}, {kTfLiteUInt8}, - {kTfLiteInt64}, {kTfLiteInt16}, - }; + } cases[] = {{kTfLiteFloat32}, {kTfLiteInt32}, {kTfLiteUInt8}, + {kTfLiteInt64}, {kTfLiteInt16}, {kTfLiteFloat16}}; for (auto test : cases) { Interpreter interpreter; diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatMobileNet.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatMobileNet.java index c87ffff8f6c..f72ce6ad105 100644 --- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatMobileNet.java +++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatMobileNet.java @@ -21,6 +21,11 @@ import java.io.IOException; /** This classifier works with the float MobileNet model. */ public class ImageClassifierFloatMobileNet extends ImageClassifier { + /** The mobile net requires additional normalization of the used input. */ + private static final float IMAGE_MEAN = 127.5f; + + private static final float IMAGE_STD = 127.5f; + /** * An array to hold inference results, to be feed into Tensorflow Lite as outputs. This isn't part * of the super class, because we need a primitive array here. 
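The `IMAGE_MEAN`/`IMAGE_STD` constants introduced just above implement the usual float-MobileNet preprocessing: each 8-bit channel value is shifted and scaled from [0, 255] into [-1, 1] instead of being divided by 255. A small C++ sketch of the same arithmetic (a hypothetical helper, shown only to make the mapping explicit):

```c++
#include <cstdint>

// Hypothetical helper mirroring the Java change: maps an 8-bit channel value
// from [0, 255] to [-1.0, 1.0] using mean = std = 127.5.
inline float NormalizeChannel(uint8_t value) {
  constexpr float kImageMean = 127.5f;
  constexpr float kImageStd = 127.5f;
  return (static_cast<float>(value) - kImageMean) / kImageStd;
}
// For example, NormalizeChannel(0) == -1.0f and NormalizeChannel(255) == 1.0f.
```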
@@ -67,9 +72,9 @@ public class ImageClassifierFloatMobileNet extends ImageClassifier { @Override protected void addPixelValue(int pixelValue) { - imgData.putFloat(((pixelValue >> 16) & 0xFF) / 255.f); - imgData.putFloat(((pixelValue >> 8) & 0xFF) / 255.f); - imgData.putFloat((pixelValue & 0xFF) / 255.f); + imgData.putFloat((((pixelValue >> 16) & 0xFF) - IMAGE_MEAN) / IMAGE_STD); + imgData.putFloat((((pixelValue >> 8) & 0xFF) - IMAGE_MEAN) / IMAGE_STD); + imgData.putFloat(((pixelValue & 0xFF) - IMAGE_MEAN) / IMAGE_STD); } @Override diff --git a/tensorflow/lite/java/src/main/native/BUILD b/tensorflow/lite/java/src/main/native/BUILD index 52194e86db3..e5ec209b2e7 100644 --- a/tensorflow/lite/java/src/main/native/BUILD +++ b/tensorflow/lite/java/src/main/native/BUILD @@ -11,16 +11,13 @@ licenses(["notice"]) # Apache 2.0 cc_library( name = "native_framework_only", srcs = [ - "exception_jni.cc", + "jni_utils.cc", "nativeinterpreterwrapper_jni.cc", "tensor_jni.cc", "tensorflow_lite_jni.cc", ], hdrs = [ - "exception_jni.h", - "nativeinterpreterwrapper_jni.h", - "tensor_jni.h", - "tensorflow_lite_jni.h", + "jni_utils.h", ], copts = tflite_copts(), linkopts = [ @@ -42,9 +39,6 @@ cc_library( srcs = [ "init_tensorflow_jni.cc", ], - hdrs = [ - "init_tensorflow_jni.h", - ], copts = tflite_copts(), deps = [ "//tensorflow/lite/java/jni", diff --git a/tensorflow/lite/java/src/main/native/init_tensorflow_jni.cc b/tensorflow/lite/java/src/main/native/init_tensorflow_jni.cc index 1fa9d1f50e5..1cda86ec65e 100644 --- a/tensorflow/lite/java/src/main/native/init_tensorflow_jni.cc +++ b/tensorflow/lite/java/src/main/native/init_tensorflow_jni.cc @@ -13,10 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/java/src/main/native/init_tensorflow_jni.h" +#include <jni.h> + #include "tensorflow/lite/testing/init_tensorflow.h" +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + JNIEXPORT void JNICALL Java_org_tensorflow_lite_TensorFlowLite_initTensorFlow( JNIEnv* env, jclass clazz) { ::tflite::InitTensorFlow(); } + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/tensorflow/lite/java/src/main/native/init_tensorflow_jni.h b/tensorflow/lite/java/src/main/native/init_tensorflow_jni.h deleted file mode 100644 index 1454d6d4633..00000000000 --- a/tensorflow/lite/java/src/main/native/init_tensorflow_jni.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#ifndef TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_INIT_TENSORFLOW_JNI_H_ -#define TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_INIT_TENSORFLOW_JNI_H_ - -#include <jni.h> - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/* - * Class: org_tensorflow_lite_TensorFlowLite - * Method: initTensorFlow - * Signature: ()V - */ -JNIEXPORT void JNICALL Java_org_tensorflow_lite_TensorFlowLite_initTensorFlow( - JNIEnv* env, jclass clazz); - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - -#endif // TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_INIT_TENSORFLOW_JNI_H_ diff --git a/tensorflow/lite/java/src/main/native/exception_jni.cc b/tensorflow/lite/java/src/main/native/jni_utils.cc similarity index 89% rename from tensorflow/lite/java/src/main/native/exception_jni.cc rename to tensorflow/lite/java/src/main/native/jni_utils.cc index 74217d6b682..0bec91b94c7 100644 --- a/tensorflow/lite/java/src/main/native/exception_jni.cc +++ b/tensorflow/lite/java/src/main/native/jni_utils.cc @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/lite/java/src/main/native/jni_utils.h" + #include <stdarg.h> #include <stdio.h> #include <stdlib.h> -#include "tensorflow/lite/java/src/main/native/exception_jni.h" - const char kIllegalArgumentException[] = "java/lang/IllegalArgumentException"; const char kIllegalStateException[] = "java/lang/IllegalStateException"; const char kNullPointerException[] = "java/lang/NullPointerException"; @@ -26,7 +26,10 @@ const char kIndexOutOfBoundsException[] = "java/lang/IndexOutOfBoundsException"; const char kUnsupportedOperationException[] = "java/lang/UnsupportedOperationException"; -void throwException(JNIEnv* env, const char* clazz, const char* fmt, ...) { +namespace tflite { +namespace jni { + +void ThrowException(JNIEnv* env, const char* clazz, const char* fmt, ...) { va_list args; va_start(args, fmt); const size_t max_msg_len = 512; @@ -45,7 +48,7 @@ void throwException(JNIEnv* env, const char* clazz, const char* fmt, ...) { BufferErrorReporter::BufferErrorReporter(JNIEnv* env, int limit) { buffer_ = new char[limit]; if (!buffer_) { - throwException(env, kNullPointerException, + ThrowException(env, kNullPointerException, "Internal error: Malloc of BufferErrorReporter to hold %d " "char failed.", limit); @@ -68,3 +71,6 @@ int BufferErrorReporter::Report(const char* format, va_list args) { } const char* BufferErrorReporter::CachedErrorMessage() { return buffer_; } + +} // namespace jni +} // namespace tflite diff --git a/tensorflow/lite/java/src/main/native/exception_jni.h b/tensorflow/lite/java/src/main/native/jni_utils.h similarity index 71% rename from tensorflow/lite/java/src/main/native/exception_jni.h rename to tensorflow/lite/java/src/main/native/jni_utils.h index ebd91e875b5..cb0cdf5b49f 100644 --- a/tensorflow/lite/java/src/main/native/exception_jni.h +++ b/tensorflow/lite/java/src/main/native/jni_utils.h @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,15 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_ -#define TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_ +#ifndef TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_JNI_UTILS_H_ +#define TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_JNI_UTILS_H_ #include <jni.h> -#include "tensorflow/lite/error_reporter.h" -#ifdef __cplusplus -extern "C" { -#endif +#include "tensorflow/lite/error_reporter.h" extern const char kIllegalArgumentException[]; extern const char kIllegalStateException[]; @@ -29,9 +26,12 @@ extern const char kNullPointerException[]; extern const char kIndexOutOfBoundsException[]; extern const char kUnsupportedOperationException[]; -void throwException(JNIEnv* env, const char* clazz, const char* fmt, ...); +namespace tflite { +namespace jni { -class BufferErrorReporter : public tflite::ErrorReporter { +void ThrowException(JNIEnv* env, const char* clazz, const char* fmt, ...); + +class BufferErrorReporter : public ErrorReporter { public: BufferErrorReporter(JNIEnv* env, int limit); virtual ~BufferErrorReporter(); @@ -44,7 +44,7 @@ class BufferErrorReporter : public tflite::ErrorReporter { int end_idx_ = 0; }; -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus -#endif // TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_ +} // namespace jni +} // namespace tflite + +#endif // TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_JNI_UTILS_H_ diff --git a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc index d4916006b62..c2abbab1240 100644 --- a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc +++ b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc @@ -13,12 +13,30 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.h" +#include <jni.h> +#include <stdio.h> +#include <time.h> + +#include <vector> + +#include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/java/src/main/native/jni_utils.h" +#include "tensorflow/lite/model.h" + +namespace tflite { +// This is to be provided at link-time by a library. 
+extern std::unique_ptr<OpResolver> CreateOpResolver(); +} // namespace tflite + +using tflite::jni::BufferErrorReporter; +using tflite::jni::ThrowException; + namespace { tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) { if (handle == 0) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Invalid handle to Interpreter."); return nullptr; } @@ -27,7 +45,7 @@ tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) { tflite::FlatBufferModel* convertLongToModel(JNIEnv* env, jlong handle) { if (handle == 0) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Invalid handle to model."); return nullptr; } @@ -36,7 +54,7 @@ tflite::FlatBufferModel* convertLongToModel(JNIEnv* env, jlong handle) { BufferErrorReporter* convertLongToErrorReporter(JNIEnv* env, jlong handle) { if (handle == 0) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Invalid handle to ErrorReporter."); return nullptr; } @@ -45,7 +63,7 @@ BufferErrorReporter* convertLongToErrorReporter(JNIEnv* env, jlong handle) { TfLiteDelegate* convertLongToDelegate(JNIEnv* env, jlong handle) { if (handle == 0) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Invalid handle to delegate."); return nullptr; } @@ -57,7 +75,7 @@ std::vector<int> convertJIntArrayToVector(JNIEnv* env, jintArray inputs) { std::vector<int> outputs(size, 0); jint* ptr = env->GetIntArrayElements(inputs, nullptr); if (ptr == nullptr) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Array has empty dimensions."); return {}; } @@ -105,7 +123,7 @@ bool AreDimsDifferent(JNIEnv* env, TfLiteTensor* tensor, jintArray dims) { int num_dims = static_cast<int>(env->GetArrayLength(dims)); jint* ptr = env->GetIntArrayElements(dims, nullptr); if (ptr == nullptr) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Empty dimensions of input array."); return true; } @@ -132,6 +150,10 @@ bool VerifyModel(const void* buf, size_t len) { } // namespace +#ifdef __cplusplus +extern "C" { +#endif + JNIEXPORT jobjectArray JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputNames(JNIEnv* env, jclass clazz, @@ -140,7 +162,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputNames(JNIEnv* env, if (interpreter == nullptr) return nullptr; jclass string_class = env->FindClass("java/lang/String"); if (string_class == nullptr) { - throwException(env, kUnsupportedOperationException, + ThrowException(env, kUnsupportedOperationException, "Internal error: Can not find java/lang/String class to get " "input names."); return nullptr; @@ -165,7 +187,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_allocateTensors( if (error_reporter == nullptr) return; if (interpreter->AllocateTensors() != kTfLiteOk) { - throwException( + ThrowException( env, kIllegalStateException, "Internal error: Unexpected failure when preparing tensor allocations:" " %s", @@ -215,7 +237,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env, if (interpreter == nullptr) return nullptr; jclass string_class = env->FindClass("java/lang/String"); if (string_class == nullptr) { - throwException(env, kUnsupportedOperationException, + ThrowException(env, kUnsupportedOperationException, 
"Internal error: Can not find java/lang/String class to get " "output names."); return nullptr; @@ -301,7 +323,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel( auto model = tflite::FlatBufferModel::VerifyAndBuildFromFile( path, verifier.get(), error_reporter); if (!model) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Contents of %s does not encode a valid " "TensorFlowLite model: %s", path, error_reporter->CachedErrorMessage()); @@ -322,7 +344,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer( static_cast<char*>(env->GetDirectBufferAddress(model_buffer)); jlong capacity = env->GetDirectBufferCapacity(model_buffer); if (!VerifyModel(buf, capacity)) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "ByteBuffer is not a valid flatbuffer model"); return 0; } @@ -330,7 +352,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer( auto model = tflite::FlatBufferModel::BuildFromBuffer( buf, static_cast<size_t>(capacity), error_reporter); if (!model) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "ByteBuffer does not encode a valid model: %s", error_reporter->CachedErrorMessage()); return 0; @@ -352,7 +374,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter( TfLiteStatus status = tflite::InterpreterBuilder(*model, *(resolver.get()))( &interpreter, static_cast<int>(num_threads)); if (status != kTfLiteOk) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Cannot create interpreter: %s", error_reporter->CachedErrorMessage()); return 0; @@ -373,7 +395,7 @@ JNIEXPORT void JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_run( if (error_reporter == nullptr) return; if (interpreter->Invoke() != kTfLiteOk) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Failed to run on the given Interpreter: %s", error_reporter->CachedErrorMessage()); return; @@ -387,7 +409,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputDataType( if (interpreter == nullptr) return -1; const int idx = static_cast<int>(output_idx); if (output_idx < 0 || output_idx >= interpreter->outputs().size()) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Failed to get %d-th output out of %d outputs", output_idx, interpreter->outputs().size()); return -1; @@ -404,7 +426,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputQuantizationZeroPoint if (interpreter == nullptr) return 0; const int idx = static_cast<int>(output_idx); if (output_idx < 0 || output_idx >= interpreter->outputs().size()) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Failed to get %d-th output out of %d outputs", output_idx, interpreter->outputs().size()); return 0; @@ -420,7 +442,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputQuantizationScale( if (interpreter == nullptr) return 1.0f; const int idx = static_cast<int>(output_idx); if (output_idx < 0 || output_idx >= interpreter->outputs().size()) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Failed to get %d-th output out of %d outputs", output_idx, interpreter->outputs().size()); return 1.0f; @@ -441,7 +463,7 @@ 
Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput( if (interpreter == nullptr) return JNI_FALSE; const int idx = static_cast<int>(input_idx); if (idx < 0 || idx >= interpreter->inputs().size()) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Input error: Can not resize %d-th input for a model having " "%d inputs.", idx, interpreter->inputs().size()); @@ -454,7 +476,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput( TfLiteStatus status = interpreter->ResizeInputTensor( interpreter->inputs()[idx], convertJIntArrayToVector(env, dims)); if (status != kTfLiteOk) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Failed to resize %d-th input: %s", idx, error_reporter->CachedErrorMessage()); return JNI_FALSE; @@ -480,7 +502,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_applyDelegate( TfLiteStatus status = interpreter->ModifyGraphWithDelegate(delegate); if (status != kTfLiteOk) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Failed to apply delegate: %s", error_reporter->CachedErrorMessage()); } @@ -499,3 +521,7 @@ JNIEXPORT void JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_delete( delete convertLongToErrorReporter(env, error_handle); } } + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.h b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.h deleted file mode 100644 index e28e6a6d3c5..00000000000 --- a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.h +++ /dev/null @@ -1,257 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_NATIVEINTERPRETERWRAPPER_JNI_H_ -#define TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_NATIVEINTERPRETERWRAPPER_JNI_H_ - -#include <jni.h> -#include <stdio.h> -#include <time.h> -#include <vector> -#include "tensorflow/lite/c/c_api_internal.h" -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/java/src/main/native/exception_jni.h" -#include "tensorflow/lite/java/src/main/native/tensor_jni.h" -#include "tensorflow/lite/model.h" - -namespace tflite { -// This is to be provided at link-time by a library. 
-extern std::unique_ptr<OpResolver> CreateOpResolver(); -} // namespace tflite - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: allocateTensors - * Signature: (JJ)V - */ -JNIEXPORT void JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_allocateTensors( - JNIEnv* env, jclass clazz, jlong handle, jlong error_handle); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: getInputTensorIndex - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputTensorIndex( - JNIEnv* env, jclass clazz, jlong handle, jint input_index); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: getOutputTensorIndex - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputTensorIndex( - JNIEnv* env, jclass clazz, jlong handle, jint output_index); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: getInputCount - * Signature: (J)I - */ -JNIEXPORT jint JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputCount(JNIEnv* env, - jclass clazz, - jlong handle); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: getOutputCount - * Signature: (J)I - */ -JNIEXPORT jint JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputCount(JNIEnv* env, - jclass clazz, - jlong handle); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (J)[Ljava/lang/Object; - */ -JNIEXPORT jobjectArray JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputNames(JNIEnv* env, - jclass clazz, - jlong handle); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (J)[Ljava/lang/Object; - */ -JNIEXPORT jobjectArray JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env, - jclass clazz, - jlong handle); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (JZ)V - */ -JNIEXPORT void JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env, - jclass clazz, - jlong handle, - jboolean state); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (JZ)V - */ -JNIEXPORT void JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_allowFp16PrecisionForFp32( - JNIEnv* env, jclass clazz, jlong handle, jboolean allow); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (JZ)V - */ -JNIEXPORT void JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_allowBufferHandleOutput( - JNIEnv* env, jclass clazz, jlong handle, jboolean allow); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (JI)V - */ -JNIEXPORT void JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_numThreads(JNIEnv* env, - jclass clazz, - jlong handle, - jint num_threads); -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (I)J - */ -JNIEXPORT jlong JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_createErrorReporter( - JNIEnv* env, jclass clazz, jint size); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (Ljava/lang/String;J)J - */ -JNIEXPORT jlong JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel( - JNIEnv* env, jclass clazz, jstring model_file, jlong error_handle); - -/* - * Class: 
org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (Ljava/lang/Object;J)J - */ -JNIEXPORT jlong JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer( - JNIEnv* env, jclass clazz, jobject model_buffer, jlong error_handle); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (JJI)J - */ -JNIEXPORT jlong JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter( - JNIEnv* env, jclass clazz, jlong model_handle, jlong error_handle, - jint num_threads); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: run - * Signature: (JJ)V - */ -JNIEXPORT void JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_run( - JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (JI)I - * - * Gets output dimensions. - */ -JNIEXPORT jint JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputDataType( - JNIEnv* env, jclass clazz, jlong handle, jint output_idx); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (JI)I - * - * Gets output quantization zero point. - */ -JNIEXPORT jint JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputQuantizationZeroPoint( - JNIEnv* env, jclass clazz, jlong handle, jint output_idx); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (JI)F - * - * Gets output quantization scale. - */ -JNIEXPORT jfloat JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputQuantizationScale( - JNIEnv* env, jclass clazz, jlong handle, jint output_idx); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (JJI[I)Z - * - * It returns true if resizing input tensor to different dimensions, else return - * false. - */ -JNIEXPORT jboolean JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput( - JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle, - jint input_idx, jintArray dims); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: applyDelegate - * Signature: (JJJ)V - */ -JNIEXPORT void JNICALL -Java_org_tensorflow_lite_NativeInterpreterWrapper_applyDelegate( - JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle, - jlong delegate_handle); - -/* - * Class: org_tensorflow_lite_NativeInterpreterWrapper - * Method: - * Signature: (JJJ) - */ -JNIEXPORT void JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_delete( - JNIEnv* env, jclass clazz, jlong error_handle, jlong model_handle, - jlong interpreter_handle); - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus -#endif // TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_NATIVEINTERPRETERWRAPPER_JNI_H_ diff --git a/tensorflow/lite/java/src/main/native/tensor_jni.cc b/tensorflow/lite/java/src/main/native/tensor_jni.cc index f07437e7f31..31937517dbd 100644 --- a/tensorflow/lite/java/src/main/native/tensor_jni.cc +++ b/tensorflow/lite/java/src/main/native/tensor_jni.cc @@ -13,14 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/lite/java/src/main/native/tensor_jni.h" +#include <jni.h> + #include <cstring> #include <memory> + #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/java/src/main/native/exception_jni.h" +#include "tensorflow/lite/java/src/main/native/jni_utils.h" #include "tensorflow/lite/string_util.h" +using tflite::jni::ThrowException; + namespace { // Convenience handle for obtaining a TfLiteTensor given an interpreter and @@ -44,7 +48,7 @@ class TensorHandle { TfLiteTensor* GetTensorFromHandle(JNIEnv* env, jlong handle) { if (handle == 0) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Invalid handle to TfLiteTensor."); return nullptr; } @@ -53,7 +57,7 @@ TfLiteTensor* GetTensorFromHandle(JNIEnv* env, jlong handle) { int GetTensorIndexFromHandle(JNIEnv* env, jlong handle) { if (handle == 0) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Invalid handle to TfLiteTensor."); return -1; } @@ -95,7 +99,7 @@ size_t WriteOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type, const int num_elements = env->GetArrayLength(array); size_t to_copy = num_elements * ElementByteSize(type); if (to_copy > dst_size) { - throwException(env, kIllegalStateException, + ThrowException(env, kIllegalStateException, "Internal error: cannot write Java array of %d bytes to " "Tensor of %d bytes", to_copy, dst_size); @@ -127,7 +131,7 @@ size_t WriteOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type, return to_copy; } default: { - throwException(env, kUnsupportedOperationException, + ThrowException(env, kUnsupportedOperationException, "DataType error: TensorFlowLite currently supports float " "(32 bits), int (32 bits), byte (8 bits), and long " "(64 bits), support for other types (DataType %d in this " @@ -143,7 +147,7 @@ size_t ReadOneDimensionalArray(JNIEnv* env, TfLiteType data_type, const int len = env->GetArrayLength(dst); const size_t size = len * ElementByteSize(data_type); if (size > src_size) { - throwException( + ThrowException( env, kIllegalStateException, "Internal error: cannot fill a Java array of %d bytes with a Tensor of " "%d bytes", @@ -175,7 +179,7 @@ size_t ReadOneDimensionalArray(JNIEnv* env, TfLiteType data_type, return size; } default: { - throwException(env, kIllegalStateException, + ThrowException(env, kIllegalStateException, "DataType error: invalid DataType(%d)", data_type); } } @@ -294,6 +298,10 @@ void WriteMultiDimensionalStringArray(JNIEnv* env, jobject src, } // namespace +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_Tensor_create( JNIEnv* env, jclass clazz, jlong interpreter_handle, jint tensor_index) { tflite::Interpreter* interpreter = @@ -313,7 +321,7 @@ JNIEXPORT jobject JNICALL Java_org_tensorflow_lite_Tensor_buffer(JNIEnv* env, TfLiteTensor* tensor = GetTensorFromHandle(env, handle); if (tensor == nullptr) return nullptr; if (tensor->data.raw == nullptr) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Tensor hasn't been allocated."); return nullptr; } @@ -328,7 +336,7 @@ JNIEXPORT void JNICALL Java_org_tensorflow_lite_Tensor_writeDirectBuffer( char* src_data_raw = static_cast<char*>(env->GetDirectBufferAddress(src)); if (!src_data_raw) { - 
throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Input ByteBuffer is not a direct buffer"); return; } @@ -345,7 +353,7 @@ Java_org_tensorflow_lite_Tensor_readMultiDimensionalArray(JNIEnv* env, if (tensor == nullptr) return; int num_dims = tensor->dims->size; if (num_dims == 0) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Cannot copy empty/scalar Tensors."); return; } @@ -367,12 +375,12 @@ Java_org_tensorflow_lite_Tensor_writeMultiDimensionalArray(JNIEnv* env, TfLiteTensor* tensor = GetTensorFromHandle(env, handle); if (tensor == nullptr) return; if (tensor->type != kTfLiteString && tensor->data.raw == nullptr) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Target Tensor hasn't been allocated."); return; } if (tensor->dims->size == 0) { - throwException(env, kIllegalArgumentException, + ThrowException(env, kIllegalArgumentException, "Internal error: Cannot copy empty/scalar Tensors."); return; } @@ -426,3 +434,7 @@ JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_index(JNIEnv* env, jlong handle) { return GetTensorIndexFromHandle(env, handle); } + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/tensorflow/lite/java/src/main/native/tensor_jni.h b/tensorflow/lite/java/src/main/native/tensor_jni.h deleted file mode 100644 index a14f24a47d0..00000000000 --- a/tensorflow/lite/java/src/main/native/tensor_jni.h +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_ -#define TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_ - -#include <jni.h> -#include "tensorflow/lite/c/c_api_internal.h" - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/* - * Class: org_tensorflow_lite_Tensor - * Method: create - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_Tensor_create( - JNIEnv* env, jclass clazz, jlong interpreter_handle, jint tensor_index); - -/* - * Class: org_tensorflow_lite_Tensor - * Method: delete - * Signature: (J) - */ -JNIEXPORT void JNICALL Java_org_tensorflow_lite_Tensor_delete(JNIEnv* env, - jclass clazz, - jlong handle); - -/* - * Class: org_tensorflow_lite_Tensor - * Method: buffer - * Signature: (J)Ljava/nio/ByteBuffer; - */ -JNIEXPORT jobject JNICALL Java_org_tensorflow_lite_Tensor_buffer(JNIEnv* env, - jclass clazz, - jlong handle); - -/* - * Class: org_tensorflow_lite_Tensor - * Method: writeDirectBuffer - * Signature: (JLjava/nio/ByteBuffer;) - */ -JNIEXPORT void JNICALL Java_org_tensorflow_lite_Tensor_writeDirectBuffer( - JNIEnv* env, jclass clazz, jlong handle, jobject src); - -/* - * Class: org_tensorflow_lite_Tensor - * Method: dtype - * Signature: (J)I - */ -JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_dtype(JNIEnv* env, - jclass clazz, - jlong handle); - -/* - * Class: org_tensorflow_lite_Tensor - * Method: shape - * Signature: (J)[I - */ -JNIEXPORT jintArray JNICALL Java_org_tensorflow_lite_Tensor_shape(JNIEnv* env, - jclass clazz, - jlong handle); - -/* - * Class: org_tensorflow_lite_Tensor - * Method: numBytes - * Signature: (J)I - */ -JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_numBytes(JNIEnv* env, - jclass clazz, - jlong handle); - -/* - * Class: org_tensorflow_lite_Tensor - * Method: hasDelegateBufferHandle - * Signature: (J)Z - */ -JNIEXPORT jboolean JNICALL -Java_org_tensorflow_lite_Tensor_hasDelegateBufferHandle(JNIEnv* env, - jclass clazz, - jlong handle); - -/* - * Class: org_tensorflow_lite_Tensor - * Method: readMultiDimensionalArray - * Signature: (JLjava/lang/Object;) - */ -JNIEXPORT void JNICALL -Java_org_tensorflow_lite_Tensor_readMultiDimensionalArray(JNIEnv* env, - jclass clazz, - jlong handle, - jobject dst); - -/* - * Class: org_tensorflow_lite_Tensor - * Method: writeMultidimensionalArray - * Signature: (JLjava/lang/Object;) - */ -JNIEXPORT void JNICALL -Java_org_tensorflow_lite_Tensor_writeMultiDimensionalArray(JNIEnv* env, - jclass clazz, - jlong handle, - jobject src); - -/* - * Class: org_tensorflow_lite_Tensor - * Method: index - * Signature: (J)I - */ -JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_index(JNIEnv* env, - jclass clazz, - jlong handle); - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus -#endif // TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_ diff --git a/tensorflow/lite/java/src/main/native/tensorflow_lite_jni.cc b/tensorflow/lite/java/src/main/native/tensorflow_lite_jni.cc index 54a97ce013a..e2d0dfdea43 100644 --- a/tensorflow/lite/java/src/main/native/tensorflow_lite_jni.cc +++ b/tensorflow/lite/java/src/main/native/tensorflow_lite_jni.cc @@ -13,11 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include <jni.h> #include <stdio.h> -#include "tensorflow/lite/java/src/main/native/tensorflow_lite_jni.h" #include "tensorflow/lite/version.h" +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + JNIEXPORT jstring JNICALL Java_org_tensorflow_lite_TensorFlowLite_runtimeVersion(JNIEnv* env, jclass /*clazz*/) { @@ -31,3 +35,7 @@ JNIEXPORT jstring JNICALL Java_org_tensorflow_lite_TensorFlowLite_schemaVersion( snprintf(buf, sizeof(buf), "%d", TFLITE_SCHEMA_VERSION); return env->NewStringUTF(buf); } + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/tensorflow/lite/java/src/main/native/tensorflow_lite_jni.h b/tensorflow/lite/java/src/main/native/tensorflow_lite_jni.h deleted file mode 100644 index 7de218e7bf4..00000000000 --- a/tensorflow/lite/java/src/main/native/tensorflow_lite_jni.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_LITE_JNI_H_ -#define TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_LITE_JNI_H_ - -#include <jni.h> - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/* - * Class: org_tensorflow_lite_TensorFlowLite - * Method: runtimeVersion - * Signature: ()Ljava/lang/String; - */ -JNIEXPORT jstring JNICALL -Java_org_tensorflow_lite_TensorFlowLite_runtimeVersion(JNIEnv*, jclass); - -/* - * Class: org_tensorflow_lite_TensorFlowLite - * Method: schemaVersion - * Signature: ()Ljava/lang/String; - */ -JNIEXPORT jstring JNICALL -Java_org_tensorflow_lite_TensorFlowLite_schemaVersion(JNIEnv*, jclass); - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus -#endif // TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_LITE_JNI_H_ diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index f3cf5b79308..c2e923c1023 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -145,6 +145,7 @@ cc_library( name = "cpu_backend_gemm", srcs = [ "cpu_backend_gemm_ruy.h", + "cpu_backend_gemm_custom_gemv.h", ] + select({ "//tensorflow/lite/kernels:tflite_with_ruy": [], "//conditions:default": [ @@ -157,9 +158,12 @@ cc_library( "cpu_backend_gemm.h", "cpu_backend_gemm_params.h", ], + copts = tflite_copts(), deps = [ "//tensorflow/lite/kernels/internal:types", + "//tensorflow/lite/kernels/internal:common", ":cpu_backend_context", + ":cpu_backend_threadpool", # Depend on ruy regardless of `tflite_with_ruy`. See the comment in # cpu_backend_gemm.h about why ruy is the generic path. 
"//tensorflow/lite/experimental/ruy", @@ -387,11 +391,13 @@ cc_library( srcs = ["lstm_eval.cc"], hdrs = ["lstm_eval.h"], deps = [ + ":kernel_util", ":op_macros", "//tensorflow/lite/c:c_api_internal", - "//tensorflow/lite/kernels:kernel_util", "//tensorflow/lite/kernels/internal:kernel_utils", "//tensorflow/lite/kernels/internal:tensor_utils", + "//third_party/eigen3", + "@gemmlowp", ], ) diff --git a/tensorflow/lite/kernels/cpu_backend_gemm.h b/tensorflow/lite/kernels/cpu_backend_gemm.h index 18414117ef2..eccf69f19d3 100644 --- a/tensorflow/lite/kernels/cpu_backend_gemm.h +++ b/tensorflow/lite/kernels/cpu_backend_gemm.h @@ -19,6 +19,7 @@ limitations under the License. #include <cstdint> #include "tensorflow/lite/kernels/cpu_backend_context.h" +#include "tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h" #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" #include "tensorflow/lite/kernels/cpu_backend_gemm_ruy.h" @@ -92,6 +93,13 @@ void Gemm(const MatrixParams<LhsScalar>& lhs_params, const LhsScalar* lhs_data, const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params, CpuBackendContext* context) { ValidateParams(lhs_params, rhs_params, dst_params, params); + if (dst_params.cols == 1) { + // GEMV case: try a custom fast GEMV path. + if (detail::CustomGemv(lhs_params, lhs_data, rhs_params, rhs_data, + dst_params, dst_data, params, context)) { + return; + } + } GemmImpl<LhsScalar, RhsScalar, AccumScalar, DstScalar, quantization_flavor>::Run(lhs_params, lhs_data, rhs_params, rhs_data, dst_params, dst_data, params, context); diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h b/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h new file mode 100644 index 00000000000..1cbbe5ad677 --- /dev/null +++ b/tensorflow/lite/kernels/cpu_backend_gemm_custom_gemv.h @@ -0,0 +1,590 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Fast Gemv (i.e. matrix*vector multiplication) paths. +// TODO(b/132094390): remove when GEMM performance is good enough on GEMV cases. + +// TFLite's runtime ops concentrate as much as possible the matrix*vector +// use cases on the (matrix) * (column-vector) case, as opposed to +// (row-vector) * (matrix). So that is what we focus on optimizing here. +// Accordingly, the public cpu_backend_gemm::Gemm() entry point checks +// if we are in this (matrix) * (column-vector) case, and if so calls +// CustomGemv. +// +// cpu_backend_gemm::Gemm is also currently restricted (as enforced in +// ValidateParams) to the case where the left-hand side matrix is row-major. +// +// So the current scope of this CustomGemv function really is: +// (row-major matrix) * (column-vector). 
+ +#ifndef TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_CUSTOM_GEMV_H_ +#define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_CUSTOM_GEMV_H_ + +#include <type_traits> +#include <vector> + +#include "tensorflow/lite/kernels/cpu_backend_context.h" +#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" +#include "tensorflow/lite/kernels/cpu_backend_threadpool.h" +#include "tensorflow/lite/kernels/internal/common.h" + +namespace tflite { +namespace cpu_backend_gemm { +namespace detail { + +// CustomGemvImpl is what needs to be specialized for each custom GEMV path. +// +// It does not deal with any multi-threaded implementation detail. Rather, +// it provides the single-thread implementation to be run by each thread. +template <typename LhsScalar, typename RhsScalar, typename AccumScalar, + typename DstScalar, QuantizationFlavor quantization_flavor> +struct CustomGemvImpl { + // The number of rows of the left-hand-side matrix (and equivalently of the + // destination column-vector) that the kernel processes at a time. + // This will also be the minimum required number of rows for a Gemv shape + // to be supported by this path. + // + // Gemv implementations are expected to be able to deal with numbers of + // rows that aren't multiples of kKernelRows by possibly running the kernel + // again at an odd row_start, e.g. if kKernelRows==4, Run() should still + // support running on 7 rows by running twice: once with row_start=0 and then + // another time with row_start=3. + // + // On the other hand, gemv implementations are not expected to support + // running on fewer than kKernelRows rows. There is no interest in + // optimizing such narrow Gemv's that they are just a few dot-products. + // Supporting that would require custom kernel code only for that case. + static constexpr int kKernelRows = 1; + + // Returns true if the Gemv shape is supported by Run(), provided that + // (row_end - row_start) > kKernelRows. + static bool IsSupportedGivenSufficientlyManyRows( + const MatrixParams<LhsScalar>& lhs_params, + const MatrixParams<RhsScalar>& rhs_params, + const MatrixParams<DstScalar>& dst_params, + const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params) { + return false; + } + + // Performs the Gemv. + static void Run( + const MatrixParams<LhsScalar>& lhs_params, const LhsScalar* lhs_data, + const MatrixParams<RhsScalar>& rhs_params, const RhsScalar* rhs_data, + const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data, + const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params, + int row_start, int row_end) {} +}; + +// Wraps CustomGemvImpl for multi-threaded operation. 
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar, + typename DstScalar, QuantizationFlavor quantization_flavor> +class CustomGemvTask : public cpu_backend_threadpool::Task { + public: + CustomGemvTask( + const MatrixParams<LhsScalar>& lhs_params, const LhsScalar* lhs_data, + const MatrixParams<RhsScalar>& rhs_params, const RhsScalar* rhs_data, + const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data, + const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params, + int row_start, int row_end) + : lhs_params_(lhs_params), + lhs_data_(lhs_data), + rhs_params_(rhs_params), + rhs_data_(rhs_data), + dst_params_(dst_params), + dst_data_(dst_data), + params_(params), + row_start_(row_start), + row_end_(row_end) {} + + void Run() override { + using Impl = CustomGemvImpl<LhsScalar, RhsScalar, AccumScalar, DstScalar, + quantization_flavor>; + Impl::Run(lhs_params_, lhs_data_, rhs_params_, rhs_data_, dst_params_, + dst_data_, params_, row_start_, row_end_); + } + + private: + const MatrixParams<LhsScalar>& lhs_params_; + const LhsScalar* lhs_data_; + const MatrixParams<RhsScalar>& rhs_params_; + const RhsScalar* rhs_data_; + const MatrixParams<DstScalar>& dst_params_; + DstScalar* dst_data_; + const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params_; + int row_start_; + int row_end_; +}; + +// Either performs the requested Gemv operation and returns true, +// or immediately returns false. +// +// See the comment at the top of the file for the scope of what this handles. +// In summary: (row-major matrix) * (column-vector). +// +// Here is only high-level logic. +// The actual implementation details are in specializations of +// CustomGemvImpl. +template <typename LhsScalar, typename RhsScalar, typename AccumScalar, + typename DstScalar, QuantizationFlavor quantization_flavor> +bool CustomGemv( + const MatrixParams<LhsScalar>& lhs_params, const LhsScalar* lhs_data, + const MatrixParams<RhsScalar>& rhs_params, const RhsScalar* rhs_data, + const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data, + const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params, + CpuBackendContext* context) { + using Impl = CustomGemvImpl<LhsScalar, RhsScalar, AccumScalar, DstScalar, + quantization_flavor>; + if (lhs_params.rows < Impl::kKernelRows) { + return false; + } + if (!Impl::IsSupportedGivenSufficientlyManyRows(lhs_params, rhs_params, + dst_params, params)) { + return false; + } + TFLITE_DCHECK_GE(lhs_params.rows, Impl::kKernelRows); + int thread_count = LegacyHowManyThreads<Impl::kKernelRows>( + context->max_num_threads(), dst_params.rows, dst_params.cols, + lhs_params.cols); + if (thread_count == 1) { + Impl::Run(lhs_params, lhs_data, rhs_params, rhs_data, dst_params, dst_data, + params, 0, lhs_params.rows); + } else { + using Task = CustomGemvTask<LhsScalar, RhsScalar, AccumScalar, DstScalar, + quantization_flavor>; + std::vector<Task> tasks; + tasks.reserve(thread_count); + const int kRowsPerThread = + RoundUp<Impl::kKernelRows>(CeilQuotient(dst_params.rows, thread_count)); + int row_start = 0; + for (int i = 0; i < thread_count; i++) { + int row_end = std::min(dst_params.rows, row_start + kRowsPerThread); + tasks.emplace_back(lhs_params, lhs_data, rhs_params, rhs_data, dst_params, + dst_data, params, row_start, row_end); + row_start = row_end; + } + cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), context); + } + return true; +} + +// USE_NEON still allows for x86 where we may be using the arm_neon_sse.h +// wrapper implementing 
NEON intrinsics on top of SSE4 intrinsics. +#ifdef USE_NEON + +// Some NEON helper functions used by CustomGemvImpl specializations below, +// allowing for some type genericity in them. + +inline int16x8x2_t LoadAndSubtractZeroPoint(const std::uint8_t* src, + std::uint8_t zero_point) { + uint8x16_t src_u8 = vld1q_u8(src); + int16x8_t src_s16_0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src_u8))); + int16x8_t src_s16_1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src_u8))); + int16x8x2_t result; + int16x8_t zero_point_vec = vdupq_n_s16(zero_point); + result.val[0] = vsubq_s16(src_s16_0, zero_point_vec); + result.val[1] = vsubq_s16(src_s16_1, zero_point_vec); + return result; +} + +inline int16x8x2_t LoadAndSubtractZeroPoint(const std::int8_t* src, + std::int8_t zero_point) { + int8x16_t src_s8 = vld1q_s8(src); + int16x8_t src_s16_0 = vmovl_s8(vget_low_s8(src_s8)); + int16x8_t src_s16_1 = vmovl_s8(vget_high_s8(src_s8)); + int16x8x2_t result; + int16x8_t zero_point_vec = vdupq_n_s16(zero_point); + result.val[0] = vsubq_s16(src_s16_0, zero_point_vec); + result.val[1] = vsubq_s16(src_s16_1, zero_point_vec); + return result; +} + +inline void ClampAndStore(int32x4_t src, std::uint8_t clamp_min, + std::uint8_t clamp_max, std::uint8_t* dst) { + // Narrow values down to 16 bit signed. + const int16x4_t res16 = vqmovn_s32(src); + // Narrow values down to 8 bit unsigned, saturating. + uint8x8_t res8 = vqmovun_s16(vcombine_s16(res16, res16)); + // Apply the clamping from the activation function + res8 = vmax_u8(res8, vdup_n_u8(clamp_min)); + res8 = vmin_u8(res8, vdup_n_u8(clamp_max)); + // Store results to destination. + vst1_lane_u8(dst + 0, res8, 0); + vst1_lane_u8(dst + 1, res8, 1); + vst1_lane_u8(dst + 2, res8, 2); + vst1_lane_u8(dst + 3, res8, 3); +} + +inline void ClampAndStore(int32x4_t src, std::int8_t clamp_min, + std::int8_t clamp_max, std::int8_t* dst) { + // Narrow values down to 16 bit signed. + const int16x4_t res16 = vqmovn_s32(src); + // Narrow values down to 8 bit unsigned, saturating. + int8x8_t res8 = vqmovn_s16(vcombine_s16(res16, res16)); + // Apply the clamping from the activation function + res8 = vmax_s8(res8, vdup_n_s8(clamp_min)); + res8 = vmin_s8(res8, vdup_n_s8(clamp_max)); + // Store results to destination. + vst1_lane_s8(dst + 0, res8, 0); + vst1_lane_s8(dst + 1, res8, 1); + vst1_lane_s8(dst + 2, res8, 2); + vst1_lane_s8(dst + 3, res8, 3); +} + +inline void ClampAndStore(int32x4_t src, std::int16_t clamp_min, + std::int16_t clamp_max, std::int16_t* dst) { + // Narrow values down to 16 bit signed. + int16x4_t res16 = vqmovn_s32(src); + // Apply the clamping from the activation function + res16 = vmax_s16(res16, vdup_n_s16(clamp_min)); + res16 = vmin_s16(res16, vdup_n_s16(clamp_max)); + // Store results to destination. + vst1_lane_s16(dst + 0, res16, 0); + vst1_lane_s16(dst + 1, res16, 1); + vst1_lane_s16(dst + 2, res16, 2); + vst1_lane_s16(dst + 3, res16, 3); +} + +template <typename LhsScalar, typename RhsScalar, typename DstScalar, + QuantizationFlavor quantization_flavor> +struct CustomGemvImpl<LhsScalar, RhsScalar, std::int32_t, DstScalar, + quantization_flavor> { + // This partial template specialization is less generic than its declaration + // implies: it assumes the following constraints on its free template + // parameters. We guard these assumptions in the following static_assert's. 
+ static_assert(std::is_same<LhsScalar, std::uint8_t>::value || + std::is_same<LhsScalar, std::int8_t>::value, + ""); + static_assert(std::is_same<RhsScalar, std::uint8_t>::value || + std::is_same<RhsScalar, std::int8_t>::value, + ""); + static_assert(std::is_same<DstScalar, std::uint8_t>::value || + std::is_same<DstScalar, std::int8_t>::value || + std::is_same<DstScalar, std::int16_t>::value, + ""); + static_assert(quantization_flavor == + QuantizationFlavor::kIntegerWithUniformMultiplier || + quantization_flavor == + QuantizationFlavor::kIntegerWithPerRowMultiplier, + ""); + + // This implementation's inner loop processes 4 rows of the left-hand side + // matrix at a time. + static constexpr int kKernelRows = 4; + + static bool IsSupportedGivenSufficientlyManyRows( + const MatrixParams<LhsScalar>& lhs_params, + const MatrixParams<RhsScalar>& rhs_params, + const MatrixParams<DstScalar>& dst_params, + const GemmParams<std::int32_t, DstScalar, quantization_flavor>& params) { + // There are no further requirements on the applicability of this kernel, + // beyond the left-hand-side matrix having at least kKernelRows rows, + // and the type requirements implied in this template partial + // specialization. + return true; + } + + static void Run( + const MatrixParams<LhsScalar>& lhs_params, const LhsScalar* lhs_data, + const MatrixParams<RhsScalar>& rhs_params, const RhsScalar* rhs_data, + const MatrixParams<DstScalar>& dst_params, DstScalar* dst_data, + const GemmParams<std::int32_t, DstScalar, quantization_flavor>& params, + int row_start, int row_end) { + // Handle kKernelRows ( == 4) rows of the left-hand side matrix at each + // iteration of this for loop. + TFLITE_DCHECK_GE(row_end - row_start, kKernelRows); + for (int row = row_start; row < row_end; row += kKernelRows) { + // Here is the magic where we allow this kernel to handle any odd number + // of rows as long as it's >= kKernelRows: the last group of `kKernelRows` + // rows will be nudged to fit, possibly by starting at an odd value of + // `row`. + row = std::min(row, row_end - kKernelRows); + const LhsScalar* filter_ptr = lhs_data + row * lhs_params.cols; + // 4 accumulator registers, one for each row being processed. + // Each has 4 int32 lanes that corresponds to columns modulo 4, and + // will need to be horizontally reduced at the end. + int32x4_t acc0 = vdupq_n_s32(0); + int32x4_t acc1 = acc0; + int32x4_t acc2 = acc0; + int32x4_t acc3 = acc0; + int in = 0; + // As much as possible, handle 16 columns of the left-hand side matrix + // at a time. This allows for decent NEON implementation. 
+ for (; in <= lhs_params.cols - 16; in += 16) { + int16x8x2_t input_val = + LoadAndSubtractZeroPoint(rhs_data + in, rhs_params.zero_point); + int16x8x2_t filter_val_0 = LoadAndSubtractZeroPoint( + filter_ptr + 0 * lhs_params.cols, lhs_params.zero_point); + int16x8x2_t filter_val_1 = LoadAndSubtractZeroPoint( + filter_ptr + 1 * lhs_params.cols, lhs_params.zero_point); + int16x8x2_t filter_val_2 = LoadAndSubtractZeroPoint( + filter_ptr + 2 * lhs_params.cols, lhs_params.zero_point); + int16x8x2_t filter_val_3 = LoadAndSubtractZeroPoint( + filter_ptr + 3 * lhs_params.cols, lhs_params.zero_point); + filter_ptr += 16; + acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0.val[0]), + vget_low_s16(input_val.val[0])); + acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1.val[0]), + vget_low_s16(input_val.val[0])); + acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2.val[0]), + vget_low_s16(input_val.val[0])); + acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3.val[0]), + vget_low_s16(input_val.val[0])); + acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0.val[1]), + vget_low_s16(input_val.val[1])); + acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1.val[1]), + vget_low_s16(input_val.val[1])); + acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2.val[1]), + vget_low_s16(input_val.val[1])); + acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3.val[1]), + vget_low_s16(input_val.val[1])); + acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0.val[0]), + vget_high_s16(input_val.val[0])); + acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1.val[0]), + vget_high_s16(input_val.val[0])); + acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2.val[0]), + vget_high_s16(input_val.val[0])); + acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3.val[0]), + vget_high_s16(input_val.val[0])); + acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0.val[1]), + vget_high_s16(input_val.val[1])); + acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1.val[1]), + vget_high_s16(input_val.val[1])); + acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2.val[1]), + vget_high_s16(input_val.val[1])); + acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3.val[1]), + vget_high_s16(input_val.val[1])); + } + // Leftovers: fewer than 16 columns remain. Very slow code, could be + // improved upon if critical in some application. 
+ if (in < lhs_params.cols) { + int32 buf[16]; + vst1q_s32(buf + 0, acc0); + vst1q_s32(buf + 4, acc1); + vst1q_s32(buf + 8, acc2); + vst1q_s32(buf + 12, acc3); + for (; in < lhs_params.cols; in++) { + int lane = (in + 16 - lhs_params.cols) % 4; + const int32 input_val = rhs_data[in] - rhs_params.zero_point; + for (int k = 0; k < 4; k++) { + int32 filter_val = lhs_data[in + (row + k) * lhs_params.cols] - + lhs_params.zero_point; + buf[lane + 4 * k] += filter_val * input_val; + } + } + acc0 = vld1q_s32(buf + 0); + acc1 = vld1q_s32(buf + 4); + acc2 = vld1q_s32(buf + 8); + acc3 = vld1q_s32(buf + 12); + } + + // Horizontally reduce accumulators + int32x2_t pairwise_reduced_acc_0 = + vpadd_s32(vget_low_s32(acc0), vget_high_s32(acc0)); + int32x2_t pairwise_reduced_acc_1 = + vpadd_s32(vget_low_s32(acc1), vget_high_s32(acc1)); + int32x2_t pairwise_reduced_acc_2 = + vpadd_s32(vget_low_s32(acc2), vget_high_s32(acc2)); + int32x2_t pairwise_reduced_acc_3 = + vpadd_s32(vget_low_s32(acc3), vget_high_s32(acc3)); + const int32x2_t reduced_lo = + vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); + const int32x2_t reduced_hi = + vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); + int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); + // End of horizontal reduction: now `reduced` is a single int32x4 + // containing the 4 int32 accumulators corresponding to the 4 rows + // being processed. + + // Add bias values. + int32x4_t bias_vec = vld1q_s32(params.bias + row); + reduced = vaddq_s32(reduced, bias_vec); + + // Get multiplier parameters. + int multiplier_exponent; + std::int32_t multiplier_fixedpoint; + if (quantization_flavor == + QuantizationFlavor::kIntegerWithPerRowMultiplier) { + multiplier_exponent = params.multiplier_exponent_perchannel[row]; + multiplier_fixedpoint = params.multiplier_fixedpoint_perchannel[row]; + } else { + multiplier_exponent = params.multiplier_exponent; + multiplier_fixedpoint = params.multiplier_fixedpoint; + } + + // If positive exponent, shift left. + if (multiplier_exponent > 0) { + reduced = vshlq_s32(reduced, vdupq_n_s32(multiplier_exponent)); + } + // Multiply by the fixed-point multiplier. + reduced = vqrdmulhq_n_s32(reduced, multiplier_fixedpoint); + // If negative exponent, rounding-shift-right. + if (multiplier_exponent < 0) { + using gemmlowp::RoundingDivideByPOT; + reduced = RoundingDivideByPOT(reduced, -multiplier_exponent); + } + + // Add the output offset. + const int32x4_t output_offset_vec = vdupq_n_s32(dst_params.zero_point); + reduced = vaddq_s32(reduced, output_offset_vec); + + // Finally, clamp and store to the destination. + ClampAndStore(reduced, params.clamp_min, params.clamp_max, + dst_data + row); + } + } +}; + +// The float specialization below is unconditionally faster than ruy +// because ruy does not currently have any Gemv path. +// But it is not unconditionally faster than Eigen, which is what is used +// unless TFLITE_WITH_RUY is defined. Indeed, Eigen has decently efficient +// Gemv paths, and they may use AVX instructions, while the present +// NEON intrinsics code maps at best to SSE4 on x86. +#ifdef TFLITE_WITH_RUY + +// We want to use fused multiply-add when it's available (that is, on A64 +// unconditionally and on A32 with VFPv4) because it's often faster, and +// because non-fused seems not to be available in A64 so a conscentious compiler +// might emit slow code (separate mul and add instructions) in order to +// implement the vmlaq_f32 intrinsic with strict bit-for-bit exactness on A64. 
+// (Compilers seems to be generating a fused fmla instruction at the moment, +// but that could change). +// +// We still want to support building for A32 without VFPv4. +inline float32x4_t mul_add(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) { +#ifdef __ARM_FEATURE_FMA + return vfmaq_f32(acc, lhs, rhs); +#else + return vmlaq_f32(acc, lhs, rhs); +#endif +} + +template <> +struct CustomGemvImpl<float, float, float, float, + QuantizationFlavor::kFloatingPoint> { + // This implementation's inner loop processes 4 rows of the left-hand side + // matrix at a time. + static constexpr int kKernelRows = 4; + + static bool IsSupportedGivenSufficientlyManyRows( + const MatrixParams<float>& lhs_params, + const MatrixParams<float>& rhs_params, + const MatrixParams<float>& dst_params, + const GemmParams<float, float>& params) { + // There are no further requirements on the applicability of this kernel, + // beyond the left-hand-side matrix having at least kKernelRows rows, + // and the type requirements implied in this template partial + // specialization. + return true; + } + static void Run(const MatrixParams<float>& lhs_params, const float* lhs_data, + const MatrixParams<float>& rhs_params, const float* rhs_data, + const MatrixParams<float>& dst_params, float* dst_data, + const GemmParams<float, float>& params, int row_start, + int row_end) { + // Handle kKernelRows ( == 4) rows of the left-hand side matrix at each + // iteration of this for loop. + TFLITE_DCHECK_GE(row_end - row_start, kKernelRows); + for (int row = row_start; row < row_end; row += kKernelRows) { + // Here is the magic where we allow this kernel to handle any odd number + // of rows as long as it's >= kKernelRows: the last group of `kKernelRows` + // rows will be nudged to fit, possibly by starting at an odd value of + // `row`. + row = std::min(row, row_end - kKernelRows); + const float* filter_ptr = lhs_data + row * lhs_params.cols; + // 4 accumulator registers, one for each row being processed. + // Each has 4 float32 lanes that corresponds to columns modulo 4, and + // will need to be horizontally reduced at the end. + float32x4_t acc0 = vdupq_n_f32(0); + float32x4_t acc1 = acc0; + float32x4_t acc2 = acc0; + float32x4_t acc3 = acc0; + int in = 0; + // As much as possible, handle 4 columns of the left-hand side matrix + // at a time. This allows for decent NEON implementation. + for (; in <= lhs_params.cols - 4; in += 4) { + float32x4_t input_val = vld1q_f32(rhs_data + in); + float32x4_t filter_val_0 = vld1q_f32(filter_ptr + 0 * lhs_params.cols); + float32x4_t filter_val_1 = vld1q_f32(filter_ptr + 1 * lhs_params.cols); + float32x4_t filter_val_2 = vld1q_f32(filter_ptr + 2 * lhs_params.cols); + float32x4_t filter_val_3 = vld1q_f32(filter_ptr + 3 * lhs_params.cols); + filter_ptr += 4; + acc0 = mul_add(acc0, filter_val_0, input_val); + acc1 = mul_add(acc1, filter_val_1, input_val); + acc2 = mul_add(acc2, filter_val_2, input_val); + acc3 = mul_add(acc3, filter_val_3, input_val); + } + // Leftovers: fewer than 4 columns remain. Very slow code, could be + // improved upon if critical in some application. 
+ if (in < lhs_params.cols) { + float buf[16]; + vst1q_f32(buf + 0, acc0); + vst1q_f32(buf + 4, acc1); + vst1q_f32(buf + 8, acc2); + vst1q_f32(buf + 12, acc3); + for (; in < lhs_params.cols; in++) { + int lane = (in + 4 - lhs_params.cols) % 4; + const float input_val = rhs_data[in]; + for (int k = 0; k < 4; k++) { + float filter_val = lhs_data[in + (row + k) * lhs_params.cols]; + buf[lane + 4 * k] += filter_val * input_val; + } + } + acc0 = vld1q_f32(buf + 0); + acc1 = vld1q_f32(buf + 4); + acc2 = vld1q_f32(buf + 8); + acc3 = vld1q_f32(buf + 12); + } + + // Horizontally reduce accumulators + float32x2_t pairwise_reduced_acc_0 = + vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)); + float32x2_t pairwise_reduced_acc_1 = + vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1)); + float32x2_t pairwise_reduced_acc_2 = + vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2)); + float32x2_t pairwise_reduced_acc_3 = + vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3)); + float32x2_t reduced_lo = + vpadd_f32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); + float32x2_t reduced_hi = + vpadd_f32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); + float32x4_t reduced = vcombine_f32(reduced_lo, reduced_hi); + // End of horizontal reduction: now `reduced` is a single float32x4 + // containing the 4 float32 accumulators corresponding to the 4 rows + // being processed. + + if (params.bias) { + // Add bias values. + reduced = vaddq_f32(reduced, vld1q_f32(params.bias + row)); + } + + // Clamp and store to destination. + reduced = vminq_f32(reduced, vdupq_n_f32(params.clamp_max)); + reduced = vmaxq_f32(reduced, vdupq_n_f32(params.clamp_min)); + vst1q_f32(dst_data + row, reduced); + } + } +}; + +#endif // TFLITE_WITH_RUY + +#endif // USE_NEON + +} // namespace detail +} // namespace cpu_backend_gemm +} // namespace tflite + +#endif // TENSORFLOW_LITE_KERNELS_CPU_BACKEND_GEMM_CUSTOM_GEMV_H_ diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_eigen.cc b/tensorflow/lite/kernels/cpu_backend_gemm_eigen.cc index f00278a1180..9a78ddd8c23 100644 --- a/tensorflow/lite/kernels/cpu_backend_gemm_eigen.cc +++ b/tensorflow/lite/kernels/cpu_backend_gemm_eigen.cc @@ -24,6 +24,7 @@ limitations under the License. #include "third_party/eigen3/Eigen/Core" #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" +#include "tensorflow/lite/kernels/internal/common.h" namespace tflite { namespace cpu_backend_gemm { @@ -47,8 +48,6 @@ void GemmImplUsingEigen::Run( Eigen::ColMajor>>; using EigenMatrixMapColMajorMutable = Eigen::Map< Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>>; - using EigenVectorMapConst = Eigen::Map< - const Eigen::Matrix<float, Eigen::Dynamic, 1, Eigen::ColMajor>>; EigenMatrixMapRowMajorConst eigen_lhs(lhs_data, lhs_params.rows, lhs_params.cols); @@ -57,10 +56,6 @@ void GemmImplUsingEigen::Run( EigenMatrixMapColMajorMutable eigen_dst(dst_data, dst_params.rows, dst_params.cols); - // Likewise, the assumption that params.bias != nullptr has already been - // checked. 
- EigenVectorMapConst eigen_bias(params.bias, lhs_params.rows); - if (rhs_params.cols == 1) { eigen_dst.col(0).noalias() = eigen_lhs * eigen_rhs.col(0); } else if (lhs_params.rows == 1) { @@ -69,9 +64,12 @@ void GemmImplUsingEigen::Run( eigen_dst.noalias() = eigen_lhs * eigen_rhs; } - eigen_dst = (eigen_dst.colwise() + eigen_bias) - .cwiseMin(params.clamp_max) - .cwiseMax(params.clamp_min); + if (params.bias) { + BiasAndClamp(params.clamp_min, params.clamp_max, dst_params.rows, + params.bias, dst_params.rows * dst_params.cols, dst_data); + } else { + eigen_dst = eigen_dst.cwiseMin(params.clamp_max).cwiseMax(params.clamp_min); + } } } // namespace detail diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_params.h b/tensorflow/lite/kernels/cpu_backend_gemm_params.h index a20218f6e89..40e81dcfeae 100644 --- a/tensorflow/lite/kernels/cpu_backend_gemm_params.h +++ b/tensorflow/lite/kernels/cpu_backend_gemm_params.h @@ -150,10 +150,6 @@ template <typename AccumScalar, typename DstScalar, QuantizationFlavor quantization_flavor> void ValidateGemmParams( const GemmParams<AccumScalar, DstScalar, quantization_flavor>& params) { - // For now require a bias vector. Again, ruy does not rely on that requirement - // but the gemmlowp and Eigen path would require more code to handle it, - // and currently TFLite only uses the case where there is a bias vector. - TFLITE_DCHECK(params.bias); // Guard consistency of the quantized multiplier fields. if (quantization_flavor == QuantizationFlavor::kFloatingPoint) { TFLITE_DCHECK(!params.multiplier_fixedpoint); @@ -162,12 +158,20 @@ void ValidateGemmParams( TFLITE_DCHECK(!params.multiplier_exponent_perchannel); } else if (quantization_flavor == QuantizationFlavor::kIntegerWithUniformMultiplier) { + // For now require a bias vector. Ruy does not care, but for gemmlowp + // it's a separate instantiation of the whole GEMM, so we save a lot of + // binary size by requiring a bias vector, and that's what we've been + // doing all along in our usage of gemmlowp, so somehow that must + // be OK with all existing users. + TFLITE_DCHECK(params.bias); TFLITE_DCHECK(params.multiplier_fixedpoint); // Nothing to check about multiplier_exponent TFLITE_DCHECK(!params.multiplier_fixedpoint_perchannel); TFLITE_DCHECK(!params.multiplier_exponent_perchannel); } else if (quantization_flavor == QuantizationFlavor::kIntegerWithPerRowMultiplier) { + // See above comment about requiring bias. + TFLITE_DCHECK(params.bias); TFLITE_DCHECK(!params.multiplier_fixedpoint); TFLITE_DCHECK(!params.multiplier_exponent); TFLITE_DCHECK(params.multiplier_fixedpoint_perchannel); diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_test.cc b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc index 884f9c96ab9..5f8210f6c5f 100644 --- a/tensorflow/lite/kernels/cpu_backend_gemm_test.cc +++ b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc @@ -418,7 +418,12 @@ void TestSomeGemm(int rows, int depth, int cols, } GemmParams<AccumScalar, DstScalar> params; - params.bias = bias_data.data(); + if (use_golden || !std::is_floating_point<AccumScalar>::value || + (random_engine() % 2)) { + // cpu_backend_gemm supports bias=null only in the float path. Test that + // in 50% of float testcases. + params.bias = bias_data.data(); + } if (!std::is_floating_point<AccumScalar>::value) { // some large int32 value. Not being a multiple of a large // power of two helps testing rounding behavior. 
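Editorial sketch (not part of the patch) of what the relaxed ValidateGemmParams above now expects from callers: in the float flavor the bias pointer may be left null (only clamping is applied, via BiasAndClamp or the Eigen cwiseMin/cwiseMax fallback), while the quantized flavors still require both a bias vector and multiplier fields. Field names follow cpu_backend_gemm_params.h, the default quantization flavor deduced for an integer accumulator is assumed, and the concrete values are hypothetical.

#include <cstdint>
#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"

// Hypothetical illustration of the per-flavor expectations enforced by
// ValidateGemmParams after this change.
void ConfigureParamsExample(const std::int32_t* quantized_bias) {
  using namespace tflite::cpu_backend_gemm;

  // Float flavor: the bias vector is now optional; only clamping applies.
  GemmParams<float, float> float_params;
  float_params.bias = nullptr;  // accepted since this change
  float_params.clamp_min = -1.0f;
  float_params.clamp_max = 1.0f;

  // Quantized flavor (uniform multiplier): bias and multiplier still required.
  GemmParams<std::int32_t, std::int8_t> quant_params;
  quant_params.bias = quantized_bias;               // must be non-null (DCHECKed)
  quant_params.multiplier_fixedpoint = 1234567891;  // fixed-point output scale
  quant_params.multiplier_exponent = -3;            // extra power-of-two shift
  quant_params.clamp_min = -128;
  quant_params.clamp_max = 127;

  ValidateGemmParams(float_params);
  ValidateGemmParams(quant_params);
}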
@@ -545,7 +550,7 @@ TYPED_TEST(CpuBackendGemmTest, Square) { TYPED_TEST(CpuBackendGemmTest, SquarePowerOfTwo) { std::vector<std::tuple<int, int, int>> shapes; - for (int size = 64; size <= 128; size++) { + for (int size = 64; size <= 128; size *= 2) { shapes.push_back(std::make_tuple(size, size, size)); } TestRandomGemms<TypeParam>(shapes); @@ -569,7 +574,7 @@ TYPED_TEST(CpuBackendGemmTest, VectorTimesMatrix) { TYPED_TEST(CpuBackendGemmTest, MatrixTimesNarrow) { std::vector<std::tuple<int, int, int>> shapes; - for (int size = 1; size < 100; size++) { + for (int size = 1; size < 50; size++) { shapes.push_back(std::make_tuple(size, size, 2)); shapes.push_back(std::make_tuple(size, size, 3)); shapes.push_back(std::make_tuple(size, size, 4)); @@ -607,7 +612,7 @@ TYPED_TEST(CpuBackendGemmTest, InnerProduct) { TYPED_TEST(CpuBackendGemmTest, OuterProduct) { std::vector<std::tuple<int, int, int>> shapes; - for (int size = 1; size < 200; size++) { + for (int size = 1; size < 100; size++) { shapes.push_back(std::make_tuple(size, 1, size)); } TestRandomGemms<TypeParam>(shapes); diff --git a/tensorflow/lite/kernels/detection_postprocess.cc b/tensorflow/lite/kernels/detection_postprocess.cc index 4e8600290c9..8285b4d9b84 100644 --- a/tensorflow/lite/kernels/detection_postprocess.cc +++ b/tensorflow/lite/kernels/detection_postprocess.cc @@ -230,10 +230,16 @@ class Dequantizer { void DequantizeBoxEncodings(const TfLiteTensor* input_box_encodings, int idx, float quant_zero_point, float quant_scale, + int length_box_encoding, CenterSizeEncoding* box_centersize) { const uint8* boxes = - GetTensorData<uint8>(input_box_encodings) + kNumCoordBox * idx; + GetTensorData<uint8>(input_box_encodings) + length_box_encoding * idx; Dequantizer dequantize(quant_zero_point, quant_scale); + // See definition of the KeyPointBoxCoder at + // https://github.com/tensorflow/models/blob/master/research/object_detection/box_coders/keypoint_box_coder.py + // The first four elements are the box coordinates, which is the same as the + // FastRnnBoxCoder at + // https://github.com/tensorflow/models/blob/master/research/object_detection/box_coders/faster_rcnn_box_coder.py box_centersize->y = dequantize(boxes[0]); box_centersize->x = dequantize(boxes[1]); box_centersize->h = dequantize(boxes[2]); @@ -261,7 +267,7 @@ TfLiteStatus DecodeCenterSizeBoxes(TfLiteContext* context, TfLiteNode* node, GetInput(context, node, kInputTensorBoxEncodings); TF_LITE_ENSURE_EQ(context, input_box_encodings->dims->data[0], kBatchSize); const int num_boxes = input_box_encodings->dims->data[1]; - TF_LITE_ENSURE_EQ(context, input_box_encodings->dims->data[2], kNumCoordBox); + TF_LITE_ENSURE(context, input_box_encodings->dims->data[2] >= kNumCoordBox); const TfLiteTensor* input_anchors = GetInput(context, node, kInputTensorAnchors); @@ -277,19 +283,24 @@ TfLiteStatus DecodeCenterSizeBoxes(TfLiteContext* context, TfLiteNode* node, input_box_encodings, idx, static_cast<float>(input_box_encodings->params.zero_point), static_cast<float>(input_box_encodings->params.scale), - &box_centersize); + input_box_encodings->dims->data[2], &box_centersize); DequantizeBoxEncodings( input_anchors, idx, static_cast<float>(input_anchors->params.zero_point), - static_cast<float>(input_anchors->params.scale), &anchor); + static_cast<float>(input_anchors->params.scale), kNumCoordBox, + &anchor); break; // Float - case kTfLiteFloat32: - box_centersize = ReInterpretTensor<const CenterSizeEncoding*>( - input_box_encodings)[idx]; + case kTfLiteFloat32: { + // Please see 
DequantizeBoxEncodings function for the support detail. + const int box_encoding_idx = idx * input_box_encodings->dims->data[2]; + const float* boxes = + &(GetTensorData<float>(input_box_encodings)[box_encoding_idx]); + box_centersize = *reinterpret_cast<const CenterSizeEncoding*>(boxes); anchor = ReInterpretTensor<const CenterSizeEncoding*>(input_anchors)[idx]; break; + } default: // Unsupported type. return kTfLiteError; @@ -453,6 +464,8 @@ TfLiteStatus NonMaxSuppressionMultiClassRegularHelper(TfLiteContext* context, const float* scores) { const TfLiteTensor* input_box_encodings = GetInput(context, node, kInputTensorBoxEncodings); + const TfLiteTensor* input_class_predictions = + GetInput(context, node, kInputTensorClassPredictions); const TfLiteTensor* decoded_boxes = &context->tensors[op_data->decoded_boxes_index]; @@ -469,11 +482,11 @@ TfLiteStatus NonMaxSuppressionMultiClassRegularHelper(TfLiteContext* context, const int num_classes = op_data->num_classes; const int num_detections_per_class = op_data->detections_per_class; const int max_detections = op_data->max_detections; + const int num_classes_with_background = + input_class_predictions->dims->data[2]; // The row index offset is 1 if background class is included and 0 otherwise. - const int label_offset = 1; - TF_LITE_ENSURE(context, label_offset != -1); + int label_offset = num_classes_with_background - num_classes; TF_LITE_ENSURE(context, num_detections_per_class > 0); - const int num_classes_with_background = num_classes + label_offset; // For each class, perform non-max suppression. std::vector<float> class_scores(num_boxes); @@ -578,6 +591,8 @@ TfLiteStatus NonMaxSuppressionMultiClassFastHelper(TfLiteContext* context, const float* scores) { const TfLiteTensor* input_box_encodings = GetInput(context, node, kInputTensorBoxEncodings); + const TfLiteTensor* input_class_predictions = + GetInput(context, node, kInputTensorClassPredictions); const TfLiteTensor* decoded_boxes = &context->tensors[op_data->decoded_boxes_index]; @@ -593,11 +608,11 @@ TfLiteStatus NonMaxSuppressionMultiClassFastHelper(TfLiteContext* context, const int num_boxes = input_box_encodings->dims->data[1]; const int num_classes = op_data->num_classes; const int max_categories_per_anchor = op_data->max_classes_per_detection; + const int num_classes_with_background = + input_class_predictions->dims->data[2]; // The row index offset is 1 if background class is included and 0 otherwise. 
- const int label_offset = 1; - TF_LITE_ENSURE(context, (label_offset != -1)); + int label_offset = num_classes_with_background - num_classes; TF_LITE_ENSURE(context, (max_categories_per_anchor > 0)); - const int num_classes_with_background = num_classes + label_offset; const int num_categories_per_anchor = std::min(max_categories_per_anchor, num_classes); std::vector<float> max_scores; @@ -670,7 +685,8 @@ TfLiteStatus NonMaxSuppressionMultiClass(TfLiteContext* context, const int num_classes_with_background = input_class_predictions->dims->data[2]; - TF_LITE_ENSURE(context, (num_classes_with_background == num_classes + 1)); + TF_LITE_ENSURE(context, (num_classes_with_background - num_classes <= 1)); + TF_LITE_ENSURE(context, (num_classes_with_background >= num_classes)); const TfLiteTensor* scores; switch (input_class_predictions->type) { diff --git a/tensorflow/lite/kernels/detection_postprocess_test.cc b/tensorflow/lite/kernels/detection_postprocess_test.cc index a1c061a3cad..250706bea75 100644 --- a/tensorflow/lite/kernels/detection_postprocess_test.cc +++ b/tensorflow/lite/kernels/detection_postprocess_test.cc @@ -566,6 +566,231 @@ TEST(DetectionPostprocessOpTest, QuantizedTestRegularNMS) { EXPECT_THAT(m.GetOutput4<float>(), ElementsAreArray(ArrayFloatNear({2.0}, 1e-1))); } + +TEST(DetectionPostprocessOpTest, FloatTestwithNoBackgroundClassAndNoKeypoints) { + DetectionPostprocessOpModelwithRegularNMS m( + {TensorType_FLOAT32, {1, 6, 4}}, {TensorType_FLOAT32, {1, 6, 2}}, + {TensorType_FLOAT32, {6, 4}}, {TensorType_FLOAT32, {}}, + {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, + {TensorType_FLOAT32, {}}, false); + + // six boxes in center-size encoding + m.SetInput1<float>({ + 0.0, 0.0, 0.0, 0.0, // box #1 + 0.0, 1.0, 0.0, 0.0, // box #2 + 0.0, -1.0, 0.0, 0.0, // box #3 + 0.0, 0.0, 0.0, 0.0, // box #4 + 0.0, 1.0, 0.0, 0.0, // box #5 + 0.0, 0.0, 0.0, 0.0 // box #6 + }); + // class scores - two classes without background + m.SetInput2<float>({.9, .8, .75, .72, .6, .5, .93, .95, .5, .4, .3, .2}); + // six anchors in center-size encoding + m.SetInput3<float>({ + 0.5, 0.5, 1.0, 1.0, // anchor #1 + 0.5, 0.5, 1.0, 1.0, // anchor #2 + 0.5, 0.5, 1.0, 1.0, // anchor #3 + 0.5, 10.5, 1.0, 1.0, // anchor #4 + 0.5, 10.5, 1.0, 1.0, // anchor #5 + 0.5, 100.5, 1.0, 1.0 // anchor #6 + }); + + m.Invoke(); + // detection_boxes + // in center-size + std::vector<int> output_shape1 = m.GetOutputShape1(); + EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4)); + EXPECT_THAT( + m.GetOutput1<float>(), + ElementsAreArray(ArrayFloatNear( + {0.0, 10.0, 1.0, 11.0, 0.0, 0.0, 1.0, 1.0, 0.0, 100.0, 1.0, 101.0}, + 1e-1))); + // detection_classes + std::vector<int> output_shape2 = m.GetOutputShape2(); + EXPECT_THAT(output_shape2, ElementsAre(1, 3)); + EXPECT_THAT(m.GetOutput2<float>(), + ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1))); + // detection_scores + std::vector<int> output_shape3 = m.GetOutputShape3(); + EXPECT_THAT(output_shape3, ElementsAre(1, 3)); + EXPECT_THAT(m.GetOutput3<float>(), + ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1))); + // num_detections + std::vector<int> output_shape4 = m.GetOutputShape4(); + EXPECT_THAT(output_shape4, ElementsAre(1)); + EXPECT_THAT(m.GetOutput4<float>(), + ElementsAreArray(ArrayFloatNear({3.0}, 1e-1))); +} + +TEST(DetectionPostprocessOpTest, FloatTestwithBackgroundClassAndKeypoints) { + DetectionPostprocessOpModelwithRegularNMS m( + {TensorType_FLOAT32, {1, 6, 5}}, {TensorType_FLOAT32, {1, 6, 3}}, + {TensorType_FLOAT32, {6, 4}}, {TensorType_FLOAT32, {}}, + 
{TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, + {TensorType_FLOAT32, {}}, false); + + // six boxes in center-size encoding + m.SetInput1<float>({ + 0.0, 0.0, 0.0, 0.0, 1.0, // box #1 + 0.0, 1.0, 0.0, 0.0, 1.0, // box #2 + 0.0, -1.0, 0.0, 0.0, 1.0, // box #3 + 0.0, 0.0, 0.0, 0.0, 1.0, // box #4 + 0.0, 1.0, 0.0, 0.0, 1.0, // box #5 + 0.0, 0.0, 0.0, 0.0, 1.0, // box #6 + }); + // class scores - two classes with background + m.SetInput2<float>({0., .9, .8, 0., .75, .72, 0., .6, .5, 0., .93, .95, 0., + .5, .4, 0., .3, .2}); + // six anchors in center-size encoding + m.SetInput3<float>({ + 0.5, 0.5, 1.0, 1.0, // anchor #1 + 0.5, 0.5, 1.0, 1.0, // anchor #2 + 0.5, 0.5, 1.0, 1.0, // anchor #3 + 0.5, 10.5, 1.0, 1.0, // anchor #4 + 0.5, 10.5, 1.0, 1.0, // anchor #5 + 0.5, 100.5, 1.0, 1.0 // anchor #6 + }); + + m.Invoke(); + // detection_boxes + // in center-size + std::vector<int> output_shape1 = m.GetOutputShape1(); + EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4)); + EXPECT_THAT( + m.GetOutput1<float>(), + ElementsAreArray(ArrayFloatNear( + {0.0, 10.0, 1.0, 11.0, 0.0, 0.0, 1.0, 1.0, 0.0, 100.0, 1.0, 101.0}, + 1e-1))); + // detection_classes + std::vector<int> output_shape2 = m.GetOutputShape2(); + EXPECT_THAT(output_shape2, ElementsAre(1, 3)); + EXPECT_THAT(m.GetOutput2<float>(), + ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1))); + // detection_scores + std::vector<int> output_shape3 = m.GetOutputShape3(); + EXPECT_THAT(output_shape3, ElementsAre(1, 3)); + EXPECT_THAT(m.GetOutput3<float>(), + ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1))); + // num_detections + std::vector<int> output_shape4 = m.GetOutputShape4(); + EXPECT_THAT(output_shape4, ElementsAre(1)); + EXPECT_THAT(m.GetOutput4<float>(), + ElementsAreArray(ArrayFloatNear({3.0}, 1e-1))); +} + +TEST(DetectionPostprocessOpTest, + QuantizedTestwithNoBackgroundClassAndKeypoints) { + DetectionPostprocessOpModelwithRegularNMS m( + {TensorType_UINT8, {1, 6, 5}, -1.0, 1.0}, + {TensorType_UINT8, {1, 6, 2}, 0.0, 1.0}, + {TensorType_UINT8, {6, 4}, 0.0, 100.5}, {TensorType_FLOAT32, {}}, + {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, + {TensorType_FLOAT32, {}}, false); + // six boxes in center-size encoding + std::vector<std::vector<float>> inputs1 = {{ + 0.0, 0.0, 0.0, 0.0, 1.0, // box #1 + 0.0, 1.0, 0.0, 0.0, 1.0, // box #2 + 0.0, -1.0, 0.0, 0.0, 1.0, // box #3 + 0.0, 0.0, 0.0, 0.0, 1.0, // box #4 + 0.0, 1.0, 0.0, 0.0, 1.0, // box #5 + 0.0, 0.0, 0.0, 0.0, 1.0 // box #6 + }}; + m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[0]); + // class scores - two classes with background + std::vector<std::vector<float>> inputs2 = { + {.9, .8, .75, .72, .6, .5, .93, .95, .5, .4, .3, .2}}; + m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[0]); + // six anchors in center-size encoding + std::vector<std::vector<float>> inputs3 = {{ + 0.5, 0.5, 1.0, 1.0, // anchor #1 + 0.5, 0.5, 1.0, 1.0, // anchor #2 + 0.5, 0.5, 1.0, 1.0, // anchor #3 + 0.5, 10.5, 1.0, 1.0, // anchor #4 + 0.5, 10.5, 1.0, 1.0, // anchor #5 + 0.5, 100.5, 1.0, 1.0 // anchor #6 + }}; + m.QuantizeAndPopulate<uint8_t>(m.input3(), inputs3[0]); + m.Invoke(); + // detection_boxes + // in center-size + std::vector<int> output_shape1 = m.GetOutputShape1(); + EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4)); + EXPECT_THAT( + m.GetOutput1<float>(), + ElementsAreArray(ArrayFloatNear( + {0.0, 10.0, 1.0, 11.0, 0.0, 0.0, 1.0, 1.0, 0.0, 100.0, 1.0, 101.0}, + 3e-1))); + // detection_classes + std::vector<int> output_shape2 = m.GetOutputShape2(); + EXPECT_THAT(output_shape2, 
ElementsAre(1, 3)); + EXPECT_THAT(m.GetOutput2<float>(), + ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1))); + // detection_scores + std::vector<int> output_shape3 = m.GetOutputShape3(); + EXPECT_THAT(output_shape3, ElementsAre(1, 3)); + EXPECT_THAT(m.GetOutput3<float>(), + ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1))); + // num_detections + std::vector<int> output_shape4 = m.GetOutputShape4(); + EXPECT_THAT(output_shape4, ElementsAre(1)); + EXPECT_THAT(m.GetOutput4<float>(), + ElementsAreArray(ArrayFloatNear({3.0}, 1e-1))); +} + +TEST(DetectionPostprocessOpTest, FloatTestwithNoBackgroudClassAndKeypoints) { + DetectionPostprocessOpModelwithRegularNMS m( + {TensorType_FLOAT32, {1, 6, 5}}, {TensorType_FLOAT32, {1, 6, 2}}, + {TensorType_FLOAT32, {6, 4}}, {TensorType_FLOAT32, {}}, + {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, + {TensorType_FLOAT32, {}}, false); + + // six boxes in center-size encoding + m.SetInput1<float>({ + 0.0, 0.0, 0.0, 0.0, 1.0, // box #1 + 0.0, 1.0, 0.0, 0.0, 1.0, // box #2 + 0.0, -1.0, 0.0, 0.0, 1.0, // box #3 + 0.0, 0.0, 0.0, 0.0, 1.0, // box #4 + 0.0, 1.0, 0.0, 0.0, 1.0, // box #5 + 0.0, 0.0, 0.0, 0.0, 1.0, // box #6 + }); + // class scores - two classes with no background + m.SetInput2<float>({.9, .8, .75, .72, .6, .5, .93, .95, .5, .4, .3, .2}); + // six anchors in center-size encoding + m.SetInput3<float>({ + 0.5, 0.5, 1.0, 1.0, // anchor #1 + 0.5, 0.5, 1.0, 1.0, // anchor #2 + 0.5, 0.5, 1.0, 1.0, // anchor #3 + 0.5, 10.5, 1.0, 1.0, // anchor #4 + 0.5, 10.5, 1.0, 1.0, // anchor #5 + 0.5, 100.5, 1.0, 1.0 // anchor #6 + }); + + m.Invoke(); + // detection_boxes + // in center-size + std::vector<int> output_shape1 = m.GetOutputShape1(); + EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4)); + EXPECT_THAT( + m.GetOutput1<float>(), + ElementsAreArray(ArrayFloatNear( + {0.0, 10.0, 1.0, 11.0, 0.0, 0.0, 1.0, 1.0, 0.0, 100.0, 1.0, 101.0}, + 1e-1))); + // detection_classes + std::vector<int> output_shape2 = m.GetOutputShape2(); + EXPECT_THAT(output_shape2, ElementsAre(1, 3)); + EXPECT_THAT(m.GetOutput2<float>(), + ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1))); + // detection_scores + std::vector<int> output_shape3 = m.GetOutputShape3(); + EXPECT_THAT(output_shape3, ElementsAre(1, 3)); + EXPECT_THAT(m.GetOutput3<float>(), + ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1))); + // num_detections + std::vector<int> output_shape4 = m.GetOutputShape4(); + EXPECT_THAT(output_shape4, ElementsAre(1)); + EXPECT_THAT(m.GetOutput4<float>(), + ElementsAreArray(ArrayFloatNear({3.0}, 1e-1))); +} } // namespace } // namespace custom } // namespace ops diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index 15d5d5579c2..0f363f06a8c 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -167,6 +167,27 @@ config_setting( }, ) +cc_library( + name = "common", + srcs = [], + hdrs = ["common.h"], + copts = tflite_copts(), + deps = [ + ":types", + "@gemmlowp//:fixedpoint", + ] + select({ + ":haswell": tflite_deps_intel, + ":ios_x86_64": tflite_deps_intel, + ":k8": tflite_deps_intel, + ":x86": tflite_deps_intel, + ":x86_64": tflite_deps_intel, + ":darwin": tflite_deps_intel, + ":darwin_x86_64": tflite_deps_intel, + ":freebsd": tflite_deps_intel, + "//conditions:default": [], + }), +) + cc_library( name = "optimized_base", srcs = [], @@ -180,6 +201,7 @@ cc_library( "optimized/integer_ops/add.h", "optimized/integer_ops/conv.h", "optimized/integer_ops/depthwise_conv.h", + 
"optimized/integer_ops/depthwise_conv_3x3_filter.h", "optimized/integer_ops/fully_connected.h", "optimized/integer_ops/mul.h", "optimized/integer_ops/pooling.h", @@ -196,7 +218,8 @@ cc_library( ":tensor", ":tensor_utils", "//third_party/eigen3", - "@gemmlowp", + "@gemmlowp//:fixedpoint", + "@gemmlowp//:profiler", "//tensorflow/lite/c:c_api_internal", "//tensorflow/lite/kernels:cpu_backend_context", "//tensorflow/lite/kernels:cpu_backend_threadpool", @@ -359,7 +382,8 @@ cc_library( ":strided_slice_logic", ":tensor", ":types", - "@gemmlowp", + "@gemmlowp//:fixedpoint", + "@gemmlowp//:profiler", "//tensorflow/lite/c:c_api_internal", "//tensorflow/lite/kernels:op_macros", ] + select({ @@ -479,7 +503,8 @@ cc_library( "//tensorflow/lite/kernels:activation_functor", "//tensorflow/lite/kernels:op_macros", "@arm_neon_2_x86_sse", - "@gemmlowp", + "@gemmlowp//:fixedpoint", + "@gemmlowp//:profiler", ], ) @@ -535,7 +560,8 @@ cc_library( "//tensorflow/lite/c:c_api_internal", "@arm_neon_2_x86_sse", "//tensorflow/lite/kernels:op_macros", - "@gemmlowp", + "@gemmlowp//:fixedpoint", + "@gemmlowp//:profiler", ] + select({ ":aarch64": [ ":neon_tensor_utils", @@ -642,7 +668,6 @@ cc_test( ":types", "@com_google_absl//absl/strings", "@com_google_googletest//:gtest_main", - "@gemmlowp", ], ) diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h index 2b8226c4977..03d71f2b06e 100644 --- a/tensorflow/lite/kernels/internal/common.h +++ b/tensorflow/lite/kernels/internal/common.h @@ -87,6 +87,72 @@ float ActivationFunction(float x) { output_activation_max); } +inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, + const float* bias_data, int array_size, + float* array_data) { + // Note: see b/132215220: in May 2019 we thought it would be OK to replace + // this with the Eigen one-liner: + // return (array.colwise() + bias).cwiseMin(clamp_max).cwiseMin(clamp_max). + // This turned out to severely regress performance: +4ms (i.e. 8%) on + // MobileNet v2 / 1.0 / 224. So we keep custom NEON code for now. 
+ TFLITE_DCHECK_EQ((array_size % bias_size), 0); +#ifdef USE_NEON + float* array_ptr = array_data; + float* array_end_ptr = array_ptr + array_size; + const auto clamp_min_vec = vdupq_n_f32(clamp_min); + const auto clamp_max_vec = vdupq_n_f32(clamp_max); + for (; array_ptr != array_end_ptr; array_ptr += bias_size) { + int i = 0; + for (; i <= bias_size - 16; i += 16) { + auto b0 = vld1q_f32(bias_data + i); + auto b1 = vld1q_f32(bias_data + i + 4); + auto b2 = vld1q_f32(bias_data + i + 8); + auto b3 = vld1q_f32(bias_data + i + 12); + auto a0 = vld1q_f32(array_ptr + i); + auto a1 = vld1q_f32(array_ptr + i + 4); + auto a2 = vld1q_f32(array_ptr + i + 8); + auto a3 = vld1q_f32(array_ptr + i + 12); + auto x0 = vaddq_f32(a0, b0); + auto x1 = vaddq_f32(a1, b1); + auto x2 = vaddq_f32(a2, b2); + auto x3 = vaddq_f32(a3, b3); + x0 = vmaxq_f32(clamp_min_vec, x0); + x1 = vmaxq_f32(clamp_min_vec, x1); + x2 = vmaxq_f32(clamp_min_vec, x2); + x3 = vmaxq_f32(clamp_min_vec, x3); + x0 = vminq_f32(clamp_max_vec, x0); + x1 = vminq_f32(clamp_max_vec, x1); + x2 = vminq_f32(clamp_max_vec, x2); + x3 = vminq_f32(clamp_max_vec, x3); + vst1q_f32(array_ptr + i, x0); + vst1q_f32(array_ptr + i + 4, x1); + vst1q_f32(array_ptr + i + 8, x2); + vst1q_f32(array_ptr + i + 12, x3); + } + for (; i <= bias_size - 4; i += 4) { + auto b = vld1q_f32(bias_data + i); + auto a = vld1q_f32(array_ptr + i); + auto x = vaddq_f32(a, b); + x = vmaxq_f32(clamp_min_vec, x); + x = vminq_f32(clamp_max_vec, x); + vst1q_f32(array_ptr + i, x); + } + for (; i < bias_size; i++) { + array_ptr[i] = ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], + clamp_min, clamp_max); + } + } +#else // not NEON + for (int array_offset = 0; array_offset < array_size; + array_offset += bias_size) { + for (int i = 0; i < bias_size; i++) { + array_data[array_offset + i] = ActivationFunctionWithMinMax( + array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max); + } + } +#endif +} + inline int32 MultiplyByQuantizedMultiplierSmallerThanOneExp( int32 x, int32 quantized_multiplier, int left_shift) { using gemmlowp::RoundingDivideByPOT; @@ -547,6 +613,95 @@ inline void NdArrayDescsForElementwiseBroadcast( } } +// Copied from gemmlowp::RoundDown when we dropped direct dependency on +// gemmlowp. +// +// Returns the runtime argument rounded down to the nearest multiple of +// the fixed Modulus. +template <unsigned Modulus, typename Integer> +Integer RoundDown(Integer i) { + return i - (i % Modulus); +} + +// Copied from gemmlowp::RoundUp when we dropped direct dependency on +// gemmlowp. +// +// Returns the runtime argument rounded up to the nearest multiple of +// the fixed Modulus. +template <unsigned Modulus, typename Integer> +Integer RoundUp(Integer i) { + return RoundDown<Modulus>(i + Modulus - 1); +} + +// Copied from gemmlowp::CeilQuotient when we dropped direct dependency on +// gemmlowp. +// +// Returns the quotient a / b rounded up ('ceil') to the nearest integer. +template <typename Integer> +Integer CeilQuotient(Integer a, Integer b) { + return (a + b - 1) / b; +} + +// This function is a copy of gemmlowp::HowManyThreads, copied when we dropped +// the direct dependency of internal/optimized/ on gemmlowp. +// +// It computes a reasonable number of threads to use for a GEMM of shape +// (rows, cols, depth). +// +// TODO(b/131910176): get rid of this function by switching each call site +// to its own more sensible logic for its own workload. 
+template <int KernelRows> +inline int LegacyHowManyThreads(int max_num_threads, int rows, int cols, + int depth) { + // Early-exit in the default case where multi-threading is disabled. + if (max_num_threads == 1) { + return 1; + } + + // Ensure that each thread has KernelRows rows to process, if at all possible. + int thread_count = std::min(max_num_threads, rows / KernelRows); + + // Limit the number of threads according to the overall size of the problem. + if (thread_count > 1) { + // Empirically determined value. + static constexpr std::uint64_t min_cubic_size_per_thread = 64 * 1024; + + // We can only multiply two out of three sizes without risking overflow + const std::uint64_t cubic_size = + std::uint64_t(rows) * std::uint64_t(cols) * std::uint64_t(depth); + + thread_count = std::min( + thread_count, static_cast<int>(cubic_size / min_cubic_size_per_thread)); + } + + if (thread_count < 1) { + thread_count = 1; + } + + assert(thread_count > 0 && thread_count <= max_num_threads); + return thread_count; +} + +template <typename T> +void optimized_ops_preload_l1_stream(const T* ptr) { +#ifdef __GNUC__ + // builtin offered by GCC-compatible compilers including clang + __builtin_prefetch(ptr, /* 0 means read */ 0, /* 0 means no locality */ 0); +#else + (void)ptr; +#endif +} + +template <typename T> +void optimized_ops_preload_l1_keep(const T* ptr) { +#ifdef __GNUC__ + // builtin offered by GCC-compatible compilers including clang + __builtin_prefetch(ptr, /* 0 means read */ 0, /* 3 means high locality */ 3); +#else + (void)ptr; +#endif +} + } // namespace tflite #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_ diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h index 9dec9d8928b..2ccc406998c 100644 --- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h +++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h @@ -15,7 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_3X3_FILTER_COMMON_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_3X3_FILTER_COMMON_H_ -#include "public/gemmlowp.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h" #include "tensorflow/lite/kernels/internal/types.h" @@ -319,12 +319,20 @@ template <DepthwiseConvOutputRounding output_rounding, int32 kDepth, int32 kStrideWidth, int32 kStrideHeight> struct DepthwiseConvWindow {}; +template <DepthwiseConvOutputRounding output_rounding, int32 kDepth, + int32 kStrideWidth, int32 kStrideHeight> +struct DepthwiseConvWindowPerChannel {}; + enum class EdgeType { kCorner, kHorizontal, kVertical, kCenter }; template <DepthwiseConvOutputRounding output_rounding, EdgeType kEdgeType, int kPadWidth, int kPadHeight> struct DepthwiseConvPartial {}; +template <DepthwiseConvOutputRounding output_rounding, EdgeType kEdgeType, + int kPadWidth, int kPadHeight> +struct DepthwiseConvPartialPerChannel {}; + // Copies a subset of the input designated by |input_ptr| into |output_ptr| // with the specified output dimensions. Supports output depths of 64 only as // this is the cache line size. 
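Editorial worked example (not part of the patch) of how the RoundUp, CeilQuotient and LegacyHowManyThreads helpers added to internal/common.h above combine in CustomGemv's multi-threaded path; the shape (a 1000x1024 row-major matrix times a 1024x1 vector on a context allowing 4 threads, with kKernelRows == 4) is hypothetical.

    LegacyHowManyThreads<4>(/*max_num_threads=*/4, /*rows=*/1000, /*cols=*/1, /*depth=*/1024)
      -> thread_count = min(4, 1000 / 4) = 4
      -> cubic_size = 1000 * 1 * 1024 = 1,024,000; 1,024,000 / 65,536 = 15, so the count stays 4.
    Rows per task = RoundUp<4>(CeilQuotient(1000, 4)) = RoundUp<4>(250) = 252,
      giving row ranges [0, 252), [252, 504), [504, 756) and [756, 1000) for the four tasks,
      with the last range clamped to dst_params.rows.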
@@ -367,12 +375,19 @@ struct ShuffleParams { input_height(get_shuffle_input_size(stride_height, output_height)) {} }; +enum class QuantizationType { + kNonPerChannelUint8 = 0, + kPerChannelInt8 = 1, +}; + +template < + QuantizationType quantization_type = QuantizationType::kNonPerChannelUint8> inline bool Fast3x3FilterKernelSupported( const RuntimeShape& input_shape, const RuntimeShape& filter_shape, int32 stride_width, int32 stride_height, int32 dilation_width_factor, int32 dilation_height_factor, int32 pad_width, int32 pad_height, int32 depth_multiplier, const RuntimeShape& output_shape, - int32 output_shift) { + int32 output_shift, const int32* output_shift_ptr = nullptr) { const int32 input_height = input_shape.Dims(1); const int32 input_width = input_shape.Dims(2); const int32 input_depth = input_shape.Dims(3); @@ -380,6 +395,7 @@ inline bool Fast3x3FilterKernelSupported( const int32 filter_width = filter_shape.Dims(2); const int32 output_height = output_shape.Dims(1); const int32 output_width = output_shape.Dims(2); + const int32 output_depth = output_shape.Dims(3); bool supported = filter_width == 3 && filter_height == 3 && depth_multiplier == 1 && @@ -394,6 +410,14 @@ inline bool Fast3x3FilterKernelSupported( return false; } + if (quantization_type == QuantizationType::kPerChannelInt8) { + for (int i = 0; i < output_depth; ++i) { + if (output_shift_ptr[i] <= 0) { + return false; + } + } + } + // Handle case where padding is zero but padding type is not kValid. // This would require special boundary case handling that is not supported. diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h index c77715de579..64e5898e711 100644 --- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h +++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h @@ -15,7 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_ -#include "public/gemmlowp.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h index b7a307b1f23..e100660df34 100644 --- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h +++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h @@ -15,8 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_ -#include "fixedpoint/fixedpoint.h" -#include "public/gemmlowp.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_threadpool.h" #include "tensorflow/lite/kernels/internal/common.h" diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h index cc2f4968e1c..eff9f242aae 100644 --- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h +++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h @@ -17,8 +17,7 @@ limitations under the License. 
#include <memory> -#include "fixedpoint/fixedpoint.h" -#include "public/gemmlowp.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h" #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h" @@ -5796,121 +5795,104 @@ struct ProcessPerDepth<DepthwiseConvImplementation::kUseNeon3x3DotProduct> { const uint8* filter_data, const int32* bias_data, int8* shuffled_filter_data, int32* adjusted_bias_data, const DepthwiseConvDotProdParams* function_params) { - const int depth = function_params->output_depth; - const int depth_micro_repeats = function_params->depth_micro_repeats; - const int bias_increment = function_params->bias_increment; + // Note that argument registers may be reused after parameter loading. + // x0 %[filter_data] + // x1 %[bias_data] + // x2 %[shuffled_filter_data] + // x3 %[adjusted_bias_data] + // x4 %[function_params] - constexpr int kSymmetricZeroPoint = 128; - constexpr uint8 kSignBit = 0x80; - const int32 input_offset = function_params->input_offset; - TFLITE_DCHECK_GE(input_offset, -255); - TFLITE_DCHECK_LE(input_offset, 0); - const int32 input_offset_difference = input_offset + kSymmetricZeroPoint; - const int8x16_t ones_vector = vdupq_n_s8(1); - - // Simulate NEON-register transposition of subset of filter. - int8x16_t input_0_a; - int8x16_t input_0_b; - int8x16_t input_0_c; - int8x16_t input_1_a; - int8x16_t input_1_b; - int8x16_t input_1_c; - int8x16_t input_2_a; - int8x16_t input_2_b; - int8x16_t input_2_c; - - int8x16_t filter_0_a; - int8x16_t filter_0_b; - int8x16_t filter_1_a; - int8x16_t filter_1_b; - int8x16_t filter_2_a; - int8x16_t filter_2_b; - - // Register pairs for each height. - // Effect subtraction of zero-point = 128 by XOR of sign bit. - const uint8x16_t sign_bit = vdupq_n_u8(kSignBit); - - const uint8* filter_block = filter_data; - for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) { - // Filter data is provided as filter_block[3][3][depth/8][2][4]. - // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4. - // filter_bank[3][2][4][4]; Sub-block, height 3, depth 4, width 4. 
- - const uint8* filter_block_ptr = filter_block; - input_0_a = vld1q_lane_s8x8(filter_block_ptr, input_0_a, 0); - filter_block_ptr += depth; - input_0_b = vld1q_lane_s8x8(filter_block_ptr, input_0_b, 0); - filter_block_ptr += depth; - input_0_c = vld1q_lane_s8x8(filter_block_ptr, input_0_c, 0); - filter_block_ptr += depth; - input_1_a = vld1q_lane_s8x8(filter_block_ptr, input_1_a, 0); - filter_block_ptr += depth; - input_1_b = vld1q_lane_s8x8(filter_block_ptr, input_1_b, 0); - filter_block_ptr += depth; - input_1_c = vld1q_lane_s8x8(filter_block_ptr, input_1_c, 0); - filter_block_ptr += depth; - input_2_a = vld1q_lane_s8x8(filter_block_ptr, input_2_a, 0); - filter_block_ptr += depth; - input_2_b = vld1q_lane_s8x8(filter_block_ptr, input_2_b, 0); - filter_block_ptr += depth; - input_2_c = vld1q_lane_s8x8(filter_block_ptr, input_2_c, 0); - - filter_0_a = vzip1q_s8(input_0_a, input_0_b); - filter_0_b = vzip1q_s8(input_0_c, sign_bit); - filter_1_a = vzip1q_s8(input_1_a, input_1_b); - filter_1_b = vzip1q_s8(input_1_c, sign_bit); - filter_2_a = vzip1q_s8(input_2_a, input_2_b); - filter_2_b = vzip1q_s8(input_2_c, sign_bit); - filter_0_a = veorq_s8(filter_0_a, sign_bit); - filter_0_b = veorq_s8(filter_0_b, sign_bit); - filter_1_a = veorq_s8(filter_1_a, sign_bit); - filter_1_b = veorq_s8(filter_1_b, sign_bit); - filter_2_a = veorq_s8(filter_2_a, sign_bit); - filter_2_b = veorq_s8(filter_2_b, sign_bit); - vzipq_s8x2_in_place(&filter_0_a, &filter_0_b); - vzipq_s8x2_in_place(&filter_1_a, &filter_1_b); - vzipq_s8x2_in_place(&filter_2_a, &filter_2_b); - - vst1q_s8(shuffled_filter_data, filter_0_a); - shuffled_filter_data += 16; - vst1q_s8(shuffled_filter_data, filter_0_b); - shuffled_filter_data += 16; - vst1q_s8(shuffled_filter_data, filter_1_a); - shuffled_filter_data += 16; - vst1q_s8(shuffled_filter_data, filter_1_b); - shuffled_filter_data += 16; - vst1q_s8(shuffled_filter_data, filter_2_a); - shuffled_filter_data += 16; - vst1q_s8(shuffled_filter_data, filter_2_b); - shuffled_filter_data += 16; - - int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data); - bias_data += bias_increment; - int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data); - bias_data += bias_increment; - // For instance, if input_offset == 128, no adjustment is needed. 
- - int32x4_t filter_sum_a = vdupq_n_s32(0); - filter_sum_a = vdotq_s32(filter_sum_a, filter_0_a, ones_vector); - filter_sum_a = vdotq_s32(filter_sum_a, filter_1_a, ones_vector); - filter_sum_a = vdotq_s32(filter_sum_a, filter_2_a, ones_vector); - int32x4_t filter_sum_b = vdupq_n_s32(0); - filter_sum_b = vdotq_s32(filter_sum_b, filter_0_b, ones_vector); - filter_sum_b = vdotq_s32(filter_sum_b, filter_1_b, ones_vector); - filter_sum_b = vdotq_s32(filter_sum_b, filter_2_b, ones_vector); - - adjusted_bias_data_a = vmlaq_n_s32(adjusted_bias_data_a, filter_sum_a, - input_offset_difference); - adjusted_bias_data_b = vmlaq_n_s32(adjusted_bias_data_b, filter_sum_b, - input_offset_difference); - - vst1q_s32(adjusted_bias_data, adjusted_bias_data_a); - adjusted_bias_data += 4; - vst1q_s32(adjusted_bias_data, adjusted_bias_data_b); - adjusted_bias_data += 4; - - filter_block += 8; - } + asm volatile( + // %bb.0: + "ldp w12, w11, [%[function_params], #" STR(DP_OFFSET_BIAS_INCREMENT) "]\n" + "ldrsw x9, [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n" + "ldr w10, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n" + "mov x8, xzr\n" + "add w11, w11, #128\n" // =128 + "sxtw x12, w12\n" + "movi v0.16b, #128\n" + "dup v1.4s, w11\n" + "lsl x11, x12, #3\n" + "lsl x12, x12, #2\n" + "movi v2.16b, #1\n" + // implicit-def: $q3 + // implicit-def: $q4 + // implicit-def: $q5 + // implicit-def: $q6 + // implicit-def: $q7 + // implicit-def: $q16 + // implicit-def: $q17 + // implicit-def: $q18 + // implicit-def: $q19 + "b DC_PER_DEPTH_2\n" + " DC_PER_DEPTH_1:\n" // in Loop: Header=BB177_2 Depth=1 + "add x13, %[filter_data], x8, lsl #3\n" + "ld1 { v19.d }[0], [x13], x9\n" + "movi v21.2d, #0\n" + "movi v20.2d, #0\n" + "add x8, x8, #1\n" // =1 + "ld1 { v18.d }[0], [x13], x9\n" + "ld1 { v17.d }[0], [x13], x9\n" + "zip1 v22.16b, v19.16b, v18.16b\n" + "eor v22.16b, v22.16b, v0.16b\n" + "ld1 { v16.d }[0], [x13], x9\n" + "zip1 v23.16b, v17.16b, v0.16b\n" + "eor v23.16b, v23.16b, v0.16b\n" + "zip1 v24.8h, v22.8h, v23.8h\n" + "ld1 { v7.d }[0], [x13], x9\n" + "zip2 v22.8h, v22.8h, v23.8h\n" + "sdot v21.4s, v22.16b, v2.16b\n" + "sdot v20.4s, v24.16b, v2.16b\n" + "ld1 { v6.d }[0], [x13], x9\n" + "zip1 v23.16b, v16.16b, v7.16b\n" + "eor v23.16b, v23.16b, v0.16b\n" + "ld1 { v5.d }[0], [x13], x9\n" + "zip1 v25.16b, v6.16b, v0.16b\n" + "eor v25.16b, v25.16b, v0.16b\n" + "zip1 v26.8h, v23.8h, v25.8h\n" + "ld1 { v4.d }[0], [x13], x9\n" + "zip2 v23.8h, v23.8h, v25.8h\n" + "sdot v21.4s, v23.16b, v2.16b\n" + "sdot v20.4s, v26.16b, v2.16b\n" + "ld1 { v3.d }[0], [x13]\n" + "zip1 v25.16b, v5.16b, v4.16b\n" + "stp q26, q23, [%[shuffled_filter_data], #32]\n" + "stp q24, q22, [%[shuffled_filter_data]]\n" + "zip1 v23.16b, v3.16b, v0.16b\n" + "eor v22.16b, v25.16b, v0.16b\n" + "eor v23.16b, v23.16b, v0.16b\n" + "zip1 v24.8h, v22.8h, v23.8h\n" + "zip2 v22.8h, v22.8h, v23.8h\n" + "stp q24, q22, [%[shuffled_filter_data], #64]\n" + "sdot v21.4s, v22.16b, v2.16b\n" + "ldr q22, [%[bias_data]]\n" + "ldr q23, [%[bias_data], x12]\n" + "sdot v20.4s, v24.16b, v2.16b\n" + "add %[shuffled_filter_data], x2, #96\n" // =96 + "mla v22.4s, v20.4s, v1.4s\n" + "mla v23.4s, v21.4s, v1.4s\n" + "add %[bias_data], x1, x11\n" + "stp q22, q23, [%[adjusted_bias_data]], #32\n" + " DC_PER_DEPTH_2:\n" // =>This Inner Loop Header: Depth=1 + "cmp w8, w10\n" + "b.lt DC_PER_DEPTH_1\n" + : + // Outputs. 
+ [ filter_data ] "+r"(filter_data), + [ bias_data ] "+r"(bias_data), + [ shuffled_filter_data ] "+r"(shuffled_filter_data), + [ adjusted_bias_data ] "+r"(adjusted_bias_data) + : + // Inputs. + [ function_params ] "r"(function_params) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + // We use these general-purpose registers. + "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x15", "x16"); } static inline void Run(const uint8* filter_data, const int32* bias_data, @@ -6769,10 +6751,9 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct, TFLITE_DCHECK_EQ(start_width, 1); TFLITE_DCHECK(leading_width_padding); TFLITE_DCHECK(trailing_width_padding); - // ASM should use MOVI 64-bit set. - padding_mask = vcreate_u64(~0xffffff00L); for (int k_height = 0; k_height < copy_block_height; ++k_height) { + half_work_reg = vdup_n_u8(-input_offset); half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>( input_block_data + input_block_offset), half_work_reg, 1); @@ -6784,8 +6765,6 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct, vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data + input_block_offset + 2), half_work_reg, 3); - half_work_reg = - vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg); half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit)); TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0); diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h index 991228b1f4e..d23b88cb247 100644 --- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h +++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h @@ -24,7 +24,6 @@ limitations under the License. #include <algorithm> -#include "fixedpoint/fixedpoint.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h" @@ -294,12 +293,22 @@ struct ProcessPerDepth< const int8x16_t ones_vector = vdupq_n_s8(1); // Simulate NEON-register transposition of subset of filter. - int8x16_t filter_reg_0_a; - int8x16_t filter_reg_0_b; - int8x16_t filter_reg_1_a; - int8x16_t filter_reg_1_b; - int8x16_t filter_reg_2_a; - int8x16_t filter_reg_2_b; + int8x16_t input_0_a; + int8x16_t input_0_b; + int8x16_t input_0_c; + int8x16_t input_1_a; + int8x16_t input_1_b; + int8x16_t input_1_c; + int8x16_t input_2_a; + int8x16_t input_2_b; + int8x16_t input_2_c; + + int8x16_t filter_0_a; + int8x16_t filter_0_b; + int8x16_t filter_1_a; + int8x16_t filter_1_b; + int8x16_t filter_2_a; + int8x16_t filter_2_b; // Register pairs for each height. // Effect subtraction of zero-point = 128 by XOR of sign bit. @@ -311,56 +320,52 @@ struct ProcessPerDepth< // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4. // filter_bank[3][2][4][4]; Sub-block, height 3, depth 4, width 4. - // Load zero-point into effective position of zero-padding of filter - // (register B, upper part). 
- filter_reg_0_b = vdupq_n_u8(kSignBit); - filter_reg_1_b = vdupq_n_u8(kSignBit); - filter_reg_2_b = vdupq_n_u8(kSignBit); - const uint8* filter_block_ptr = filter_block; - filter_reg_0_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_a, 0); + input_0_a = vld1q_lane_s8x8(filter_block_ptr, input_0_a, 0); filter_block_ptr += depth; - filter_reg_0_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_b, 0); + input_0_b = vld1q_lane_s8x8(filter_block_ptr, input_0_b, 0); filter_block_ptr += depth; - filter_reg_0_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_a, 1); + input_0_c = vld1q_lane_s8x8(filter_block_ptr, input_0_c, 0); filter_block_ptr += depth; - filter_reg_1_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_a, 0); + input_1_a = vld1q_lane_s8x8(filter_block_ptr, input_1_a, 0); filter_block_ptr += depth; - filter_reg_1_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_b, 0); + input_1_b = vld1q_lane_s8x8(filter_block_ptr, input_1_b, 0); filter_block_ptr += depth; - filter_reg_1_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_a, 1); + input_1_c = vld1q_lane_s8x8(filter_block_ptr, input_1_c, 0); filter_block_ptr += depth; - filter_reg_2_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_a, 0); + input_2_a = vld1q_lane_s8x8(filter_block_ptr, input_2_a, 0); filter_block_ptr += depth; - filter_reg_2_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_b, 0); + input_2_b = vld1q_lane_s8x8(filter_block_ptr, input_2_b, 0); filter_block_ptr += depth; - filter_reg_2_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_a, 1); + input_2_c = vld1q_lane_s8x8(filter_block_ptr, input_2_c, 0); - filter_reg_0_a = veorq_s8(filter_reg_0_a, sign_bit); - filter_reg_0_b = veorq_s8(filter_reg_0_b, sign_bit); - filter_reg_1_a = veorq_s8(filter_reg_1_a, sign_bit); - filter_reg_1_b = veorq_s8(filter_reg_1_b, sign_bit); - filter_reg_2_a = veorq_s8(filter_reg_2_a, sign_bit); - filter_reg_2_b = veorq_s8(filter_reg_2_b, sign_bit); + filter_0_a = vzip1q_s8(input_0_a, input_0_b); + filter_0_b = vzip1q_s8(input_0_c, sign_bit); + filter_1_a = vzip1q_s8(input_1_a, input_1_b); + filter_1_b = vzip1q_s8(input_1_c, sign_bit); + filter_2_a = vzip1q_s8(input_2_a, input_2_b); + filter_2_b = vzip1q_s8(input_2_c, sign_bit); + filter_0_a = veorq_s8(filter_0_a, sign_bit); + filter_0_b = veorq_s8(filter_0_b, sign_bit); + filter_1_a = veorq_s8(filter_1_a, sign_bit); + filter_1_b = veorq_s8(filter_1_b, sign_bit); + filter_2_a = veorq_s8(filter_2_a, sign_bit); + filter_2_b = veorq_s8(filter_2_b, sign_bit); + vzipq_s8x2_in_place(&filter_0_a, &filter_0_b); + vzipq_s8x2_in_place(&filter_1_a, &filter_1_b); + vzipq_s8x2_in_place(&filter_2_a, &filter_2_b); - vzipq_s8_in_place(&filter_reg_0_a, &filter_reg_0_b); - vzipq_s8_in_place(&filter_reg_1_a, &filter_reg_1_b); - vzipq_s8_in_place(&filter_reg_2_a, &filter_reg_2_b); - vzipq_s8x2_in_place(&filter_reg_0_a, &filter_reg_0_b); - vzipq_s8x2_in_place(&filter_reg_1_a, &filter_reg_1_b); - vzipq_s8x2_in_place(&filter_reg_2_a, &filter_reg_2_b); - - vst1q_s8(shuffled_filter_data, filter_reg_0_a); + vst1q_s8(shuffled_filter_data, filter_0_a); shuffled_filter_data += 16; - vst1q_s8(shuffled_filter_data, filter_reg_0_b); + vst1q_s8(shuffled_filter_data, filter_0_b); shuffled_filter_data += 16; - vst1q_s8(shuffled_filter_data, filter_reg_1_a); + vst1q_s8(shuffled_filter_data, filter_1_a); shuffled_filter_data += 16; - vst1q_s8(shuffled_filter_data, filter_reg_1_b); + vst1q_s8(shuffled_filter_data, filter_1_b); shuffled_filter_data += 16; - vst1q_s8(shuffled_filter_data, filter_reg_2_a); + 
vst1q_s8(shuffled_filter_data, filter_2_a); shuffled_filter_data += 16; - vst1q_s8(shuffled_filter_data, filter_reg_2_b); + vst1q_s8(shuffled_filter_data, filter_2_b); shuffled_filter_data += 16; int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data); @@ -370,13 +375,13 @@ struct ProcessPerDepth< // For instance, if input_offset == 128, no adjustment is needed. int32x4_t filter_sum_a = vdupq_n_s32(0); - filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_0_a, ones_vector); - filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_1_a, ones_vector); - filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_2_a, ones_vector); + filter_sum_a = vdotq_s32(filter_sum_a, filter_0_a, ones_vector); + filter_sum_a = vdotq_s32(filter_sum_a, filter_1_a, ones_vector); + filter_sum_a = vdotq_s32(filter_sum_a, filter_2_a, ones_vector); int32x4_t filter_sum_b = vdupq_n_s32(0); - filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_0_b, ones_vector); - filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_1_b, ones_vector); - filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_2_b, ones_vector); + filter_sum_b = vdotq_s32(filter_sum_b, filter_0_b, ones_vector); + filter_sum_b = vdotq_s32(filter_sum_b, filter_1_b, ones_vector); + filter_sum_b = vdotq_s32(filter_sum_b, filter_2_b, ones_vector); adjusted_bias_data_a = vmlaq_n_s32(adjusted_bias_data_a, filter_sum_a, input_offset_difference); @@ -2049,10 +2054,9 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct, TFLITE_DCHECK_EQ(start_width, 1); TFLITE_DCHECK(leading_width_padding); TFLITE_DCHECK(trailing_width_padding); - // ASM should use MOVI 64-bit set. - padding_mask = vcreate_u64(~0xffffff00L); for (int k_height = 0; k_height < copy_block_height; ++k_height) { + half_work_reg = vdup_n_u8(-input_offset); half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>( input_block_data + input_block_offset), half_work_reg, 1); @@ -2064,8 +2068,6 @@ struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct, vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data + input_block_offset + 2), half_work_reg, 3); - half_work_reg = - vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg); half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit)); TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0); diff --git a/tensorflow/lite/kernels/internal/optimized/im2col_utils.h b/tensorflow/lite/kernels/internal/optimized/im2col_utils.h index e3600a783ff..e6dd6f8e05f 100644 --- a/tensorflow/lite/kernels/internal/optimized/im2col_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/im2col_utils.h @@ -15,7 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_IM2COL_UTILS_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_IM2COL_UTILS_H_ -#include "public/gemmlowp.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h index da839ca98db..2d6362a6aa0 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h @@ -15,7 +15,7 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_ADD_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_ADD_H_ -#include "public/gemmlowp.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h index ef1f66482d1..2c67b97a645 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_CONV_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_CONV_H_ +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_gemm.h" #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h index 178ddb6da38..1edf7362813 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h @@ -15,11 +15,12 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_ -#include "fixedpoint/fixedpoint.h" -#include "public/gemmlowp.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_threadpool.h" #include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h" +#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h" #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h" #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h new file mode 100644 index 00000000000..b14371ff027 --- /dev/null +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h @@ -0,0 +1,2957 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_3X3_FILTER_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_3X3_FILTER_H_ + +#include <memory> + +#include "profiling/instrumentation.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h" +#include "tensorflow/lite/kernels/internal/types.h" + +namespace tflite { +namespace optimized_ops { +namespace depthwise_conv { + +#ifdef USE_NEON + +#define STR(s) STR_UNEXPANDED(s) +#define STR_UNEXPANDED(s) #s + +// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on +// Jetson TX-2. This compiler does not support the offsetof() macro. +#if defined(__aarch64__) && !defined(GOOGLE_L4T) +#include <stddef.h> + +// Represents the number of bytes offset from the start of the +// DepthwiseConvParams struct. This is used in the asm to load parameters. +// Keep these values in sync with the static_asserts below. +#define OFFSET_INPUT_DEPTH 0 +#define OFFSET_INPUT_ROW_SIZE 8 +#define OFFSET_OUTPUT_DEPTH 16 +#define OFFSET_OUTPUT_ROW_SIZE 24 +#define OFFSET_FILTER_ROW_SIZE 32 +#define OFFSET_INPUT_OFFSET 40 +#define OFFSET_OUTPUT_OFFSET 44 +#define OFFSET_FILTER_OFFSET 48 +#define OFFSET_OUTPUT_MULTIPLIER 52 +#define OFFSET_OUTPUT_ACTIVATION_MIN 56 +#define OFFSET_OUTPUT_ACTIVATION_MAX 60 +#define OFFSET_OUTPUT_RIGHT_SHIFT 64 +#define OFFSET_INPUT_WIDTH 68 +#define OFFSET_INPUT_HEIGHT 72 +#define OFFSET_STRIDE_WIDTH 76 +#define OFFSET_STRIDE_HEIGHT 80 +#define OFFSET_OUTPUT_WIDTH 84 +#define OFFSET_OUTPUT_HEIGHT 88 + +static_assert(offsetof(DepthwiseConvParams, input_depth) == OFFSET_INPUT_DEPTH, + ""); +static_assert(offsetof(DepthwiseConvParams, input_row_size) == + OFFSET_INPUT_ROW_SIZE, + ""); +static_assert(offsetof(DepthwiseConvParams, output_depth) == + OFFSET_OUTPUT_DEPTH, + ""); +static_assert(offsetof(DepthwiseConvParams, output_row_size) == + OFFSET_OUTPUT_ROW_SIZE, + ""); +static_assert(offsetof(DepthwiseConvParams, filter_row_size) == + OFFSET_FILTER_ROW_SIZE, + ""); +static_assert(offsetof(DepthwiseConvParams, input_offset) == + OFFSET_INPUT_OFFSET, + ""); +static_assert(offsetof(DepthwiseConvParams, output_offset) == + OFFSET_OUTPUT_OFFSET, + ""); +static_assert(offsetof(DepthwiseConvParams, filter_offset) == + OFFSET_FILTER_OFFSET, + ""); +static_assert(offsetof(DepthwiseConvParams, output_multiplier) == + OFFSET_OUTPUT_MULTIPLIER, + ""); +static_assert(offsetof(DepthwiseConvParams, output_activation_min) == + OFFSET_OUTPUT_ACTIVATION_MIN, + ""); +static_assert(offsetof(DepthwiseConvParams, output_activation_max) == + OFFSET_OUTPUT_ACTIVATION_MAX, + ""); +static_assert(offsetof(DepthwiseConvParams, output_right_shift) == + OFFSET_OUTPUT_RIGHT_SHIFT, + ""); +static_assert(offsetof(DepthwiseConvParams, input_width) == OFFSET_INPUT_WIDTH, + ""); +static_assert(offsetof(DepthwiseConvParams, input_height) == + OFFSET_INPUT_HEIGHT, + ""); +static_assert(offsetof(DepthwiseConvParams, stride_width) == + OFFSET_STRIDE_WIDTH, + ""); +static_assert(offsetof(DepthwiseConvParams, stride_height) == + OFFSET_STRIDE_HEIGHT, + ""); +static_assert(offsetof(DepthwiseConvParams, output_width) == + OFFSET_OUTPUT_WIDTH, + ""); +static_assert(offsetof(DepthwiseConvParams, output_height) == + OFFSET_OUTPUT_HEIGHT, + ""); +#endif // __aarch64__ +#endif // ARM NEON + +#ifdef USE_NEON + +#if defined(__aarch64__) && 
!defined(GOOGLE_L4T) + +template <> +struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1, + 1> { + public: + static inline void Run(const uint8* input_ptr, const uint8* filter_ptr, + const int32* bias_ptr, uint8* output_ptr, + int64_t input_depth, int64_t input_row_size, + int32 output_window_height, int32 output_window_width, + const DepthwiseConvParams* params_ptr) { + const int64_t input_width_increment = 2 * input_depth; + const int64_t input_height_increment = 2 * input_row_size; + const int64_t output_height_increment = 2 * params_ptr->output_row_size; + +#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "3" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "4" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "5" +#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "6" +#define DEPTHWISECONV_LABEL_HEIGHT_1 "7" +#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "8" +#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "9" +#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "10" +#define DEPTHWISECONV_LABEL_HEIGHT_1_END "11" + + asm volatile( + // Performs depthwise convolutions for a window specified by + // |output_window_height| and |output_window_width|. The inner-most loop + // processes 2x2 outputs, and any leftovers at the end. + // + // Algorithm works as follows: + // + // 1. Load filters of 8 depth (8x3x3). Registers v0--v8 hold filter + // values. + // 2. For 2 output heights at a time: + // i. For 2 output widths at a time, load inputs for a 2x1 (2 + // height, 1 width) output window (4x3 input window). + // Registers v9--v20 hold input values. Mul-add with + // accumulators v21--v24. Then run activation, downquantize + // and store. Repeat for the next 2x1 output window, + // leveraging overlapping inputs. + // ii. Handle single leftover width if exists. + // 3. Handle single leftover height if exists. + // i. For 2 output widths at a time, load inputs for a 1x2 (1 + // height, 2 width) output window (3x4 input window). + // Registers v9--v20 hold input values. Mul-add with + // accumulators v21--v24. Then run activation, downquantize + // and store. Repeat for the next 1x2 output window, + // leveraging overlapping inputs. + // ii. Handle single leftover width if exists. + // + // Loads are placed as soon as the register is no longer needed and + // interleaved with arithmetic operations to take advantage of + // dual-issue pipelines. We also add input offsets as far from the loads + // as possible to give loads enough cycles to fetch data from memory. + + // Set "constant" registers. These registers may be replaced with temp + // values from time to time when there are not enough NEON registers. + // We use x9--x15 general purpose registers as they are caller-saved + // temporary registers (see + // http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf). 
// NOLINT
+ "ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
+ "ldr x3, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
+ "cmp %w[output_window_height], #2\n"
+ "dup v26.8h, w9\n"
+ "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
+ "ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
+ "dup v27.4s, w9\n"
+ "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
+ "dup v29.8h, w2\n"
+ "ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
+ "dup v30.16b, w4\n"
+ "ldr w0, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
+ "dup v31.16b, w0\n"
+ "dup v28.4s, w9\n"
+ "ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
+ "add x10, %[bias_ptr], #16\n"
+ "ldr x1, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
+ "dup v9.8h, w9\n"
+
+ // Load filters and add offsets.
+ "ld1 {v0.8b}, [%[filter_ptr]], x3\n"
+ "ld1 {v1.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v0.8h, v9.8h, v0.8b\n"
+ "ld1 {v2.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v1.8h, v9.8h, v1.8b\n"
+ "ld1 {v3.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v2.8h, v9.8h, v2.8b\n"
+ "ld1 {v4.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v3.8h, v9.8h, v3.8b\n"
+ "ld1 {v5.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v4.8h, v9.8h, v4.8b\n"
+ "ld1 {v6.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v5.8h, v9.8h, v5.8b\n"
+ "ld1 {v7.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v6.8h, v9.8h, v6.8b\n"
+ "ld1 {v8.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v7.8h, v9.8h, v7.8b\n"
+ "uaddw v8.8h, v9.8h, v8.8b\n"
+
+ "blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
+
+ //"loop_%=:\n"
+ DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
+ // This loop processes 2x2 outputs. To avoid register exhaustion,
+ // inputs for the left 2 outputs are loaded first, then the right
+ // two outputs.
+ "mov x11, %[input_ptr]\n"
+ "mov x12, x11\n"
+ "ld1 {v9.8b}, [x12], %[input_depth]\n"
+ "add x13, x11, %[input_row_size]\n"
+ "ld1 {v10.8b}, [x12], %[input_depth]\n"
+ "add x14, x13, %[input_row_size]\n"
+ "ld1 {v11.8b}, [x12], %[input_depth]\n"
+ "add x15, x14, %[input_row_size]\n"
+ "ld1 {v12.8b}, [x13], %[input_depth]\n"
+ "mov w5, %w[output_window_width]\n"
+ "ld1 {v13.8b}, [x13], %[input_depth]\n"
+ "mov x6, %[output_ptr]\n"
+ "ld1 {v14.8b}, [x13], %[input_depth]\n"
+ "add x7, %[output_ptr], x1\n"
+ "ld1 {v15.8b}, [x14], %[input_depth]\n"
+ // The height 2 / width 2 loop loads an extra 2x1 output (2 height,
+ // 1 width) in anticipation for the next iteration. Make sure
+ // |output_window_width| is large enough to handle the additional
+ // loads, otherwise jump to the appropriate label to handle
+ // smaller widths.
+ "cmp w5, #2\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "ld1 {v16.8b}, [x14], %[input_depth]\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "ld1 {v17.8b}, [x14], %[input_depth]\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "ld1 {v18.8b}, [x15], %[input_depth]\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "ld1 {v19.8b}, [x15], %[input_depth]\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + "ld1 {v20.8b}, [x15], %[input_depth]\n" + "uaddw v14.8h, v26.8h, v14.8b\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "uaddw v15.8h, v26.8h, v15.8b\n" + "ld1 {v22.4s}, [x10]\n" + "uaddw v16.8h, v26.8h, v16.8b\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + "uaddw v17.8h, v26.8h, v17.8b\n" + "ld1 {v24.4s}, [x10]\n" + "uaddw v18.8h, v26.8h, v18.8b\n" + "uaddw v19.8h, v26.8h, v19.8b\n" + "uaddw v20.8h, v26.8h, v20.8b\n" + + "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n" + "cmp w5, #1\n" + "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n" + // Mul-add left outputs. + "smlal v21.4s, v0.4h, v9.4h\n" + "subs w5, w5, #2\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "cmp w5, #3\n" + "smlal v23.4s, v0.4h, v12.4h\n" + "ld1 {v9.8b}, [x12]\n" + "smlal2 v24.4s, v0.8h, v12.8h\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "smlal v23.4s, v1.4h, v13.4h\n" + "smlal2 v24.4s, v1.8h, v13.8h\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "smlal v23.4s, v2.4h, v14.4h\n" + "smlal2 v24.4s, v2.8h, v14.8h\n" + "smlal v21.4s, v3.4h, v12.4h\n" + "smlal2 v22.4s, v3.8h, v12.8h\n" + "ld1 {v12.8b}, [x13]\n" + "smlal v23.4s, v3.4h, v15.4h\n" + "smlal2 v24.4s, v3.8h, v15.8h\n" + "smlal v21.4s, v4.4h, v13.4h\n" + "smlal2 v22.4s, v4.8h, v13.8h\n" + "smlal v23.4s, v4.4h, v16.4h\n" + "smlal2 v24.4s, v4.8h, v16.8h\n" + "smlal v21.4s, v5.4h, v14.4h\n" + "smlal2 v22.4s, v5.8h, v14.8h\n" + "smlal v23.4s, v5.4h, v17.4h\n" + "smlal2 v24.4s, v5.8h, v17.8h\n" + "smlal v21.4s, v6.4h, v15.4h\n" + "smlal2 v22.4s, v6.8h, v15.8h\n" + "ld1 {v15.8b}, [x14]\n" + "smlal v23.4s, v6.4h, v18.4h\n" + "smlal2 v24.4s, v6.8h, v18.8h\n" + "ld1 {v18.8b}, [x15]\n" + "smlal v21.4s, v7.4h, v16.4h\n" + "smlal2 v22.4s, v7.8h, v16.8h\n" + "smlal v23.4s, v7.4h, v19.4h\n" + "smlal2 v24.4s, v7.8h, v19.8h\n" + "smlal v21.4s, v8.4h, v17.4h\n" + "smlal2 v22.4s, v8.8h, v17.8h\n" + "smlal v23.4s, v8.4h, v20.4h\n" + "smlal2 v24.4s, v8.8h, v20.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "sqrshl v21.4s, v21.4s, v28.4s\n" + "sqrshl v22.4s, v22.4s, v28.4s\n" + "sqrshl v23.4s, v23.4s, v28.4s\n" + "sqrshl v24.4s, v24.4s, v28.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqadd v21.8h, v21.8h, v29.8h\n" + "sqadd v23.8h, v23.8h, v29.8h\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun2 v21.16b, v23.8h\n" + "ld1 {v22.4s}, [x10]\n" + "umax v21.16b, v21.16b, v30.16b\n" + "umin v21.16b, v21.16b, v31.16b\n" + "ld1 {v24.4s}, [x10]\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "st1 {v21.8b}, [x6], x3\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "mov v23.d[0], v21.d[1]\n" + "st1 {v23.8b}, [x7], x3\n" + "uaddw v15.8h, v26.8h, v15.8b\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "uaddw v18.8h, v26.8h, v18.8b\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + + // Mul-add right outputs. 
+ "smlal v21.4s, v0.4h, v10.4h\n" + "add x11, x11, %[input_width_increment]\n" + "smlal2 v22.4s, v0.8h, v10.8h\n" + "mov x12, x11\n" + "smlal v23.4s, v0.4h, v13.4h\n" + "add x13, x11, %[input_row_size]\n" + "smlal2 v24.4s, v0.8h, v13.8h\n" + "add x14, x13, %[input_row_size]\n" + "smlal v21.4s, v1.4h, v11.4h\n" + "add x15, x14, %[input_row_size]\n" + "smlal2 v22.4s, v1.8h, v11.8h\n" + "smlal v23.4s, v1.4h, v14.4h\n" + "smlal2 v24.4s, v1.8h, v14.8h\n" + "smlal v21.4s, v2.4h, v9.4h\n" + "smlal2 v22.4s, v2.8h, v9.8h\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v2.4h, v12.4h\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "smlal2 v24.4s, v2.8h, v12.8h\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "smlal v21.4s, v3.4h, v13.4h\n" + "smlal2 v22.4s, v3.8h, v13.8h\n" + "smlal v23.4s, v3.4h, v16.4h\n" + "smlal2 v24.4s, v3.8h, v16.8h\n" + "smlal v21.4s, v4.4h, v14.4h\n" + "smlal2 v22.4s, v4.8h, v14.8h\n" + "smlal v23.4s, v4.4h, v17.4h\n" + "smlal2 v24.4s, v4.8h, v17.8h\n" + "smlal v21.4s, v5.4h, v12.4h\n" + "smlal2 v22.4s, v5.8h, v12.8h\n" + "ld1 {v12.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v5.4h, v15.4h\n" + "ld1 {v13.8b}, [x13], %[input_depth]\n" + "smlal2 v24.4s, v5.8h, v15.8h\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "smlal v21.4s, v6.4h, v16.4h\n" + "smlal2 v22.4s, v6.8h, v16.8h\n" + "smlal v23.4s, v6.4h, v19.4h\n" + "smlal2 v24.4s, v6.8h, v19.8h\n" + "smlal v21.4s, v7.4h, v17.4h\n" + "smlal2 v22.4s, v7.8h, v17.8h\n" + "smlal v23.4s, v7.4h, v20.4h\n" + "smlal2 v24.4s, v7.8h, v20.8h\n" + "smlal v21.4s, v8.4h, v15.4h\n" + "smlal2 v22.4s, v8.8h, v15.8h\n" + "ld1 {v15.8b}, [x14], %[input_depth]\n" + "smlal v23.4s, v8.4h, v18.4h\n" + "ld1 {v16.8b}, [x14], %[input_depth]\n" + "smlal2 v24.4s, v8.8h, v18.8h\n" + "ld1 {v17.8b}, [x14], %[input_depth]\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "ld1 {v18.8b}, [x15], %[input_depth]\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "ld1 {v19.8b}, [x15], %[input_depth]\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "ld1 {v20.8b}, [x15], %[input_depth]\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "sqrshl v21.4s, v21.4s, v28.4s\n" + "sqrshl v22.4s, v22.4s, v28.4s\n" + "sqrshl v23.4s, v23.4s, v28.4s\n" + "sqrshl v24.4s, v24.4s, v28.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqadd v21.8h, v21.8h, v29.8h\n" + "sqadd v23.8h, v23.8h, v29.8h\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun2 v21.16b, v23.8h\n" + "ld1 {v22.4s}, [x10]\n" + "umax v21.16b, v21.16b, v30.16b\n" + "umin v21.16b, v21.16b, v31.16b\n" + "ld1 {v24.4s}, [x10]\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "st1 {v21.8b}, [x6], x3\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "mov v23.d[0], v21.d[1]\n" + "st1 {v23.8b}, [x7], x3\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + "uaddw v14.8h, v26.8h, v14.8b\n" + "uaddw v15.8h, v26.8h, v15.8b\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "uaddw v16.8h, v26.8h, v16.8b\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + "uaddw v17.8h, v26.8h, v17.8b\n" + "uaddw v18.8h, v26.8h, v18.8b\n" + "uaddw v19.8h, v26.8h, v19.8b\n" + "uaddw v20.8h, v26.8h, v20.8b\n" + + "bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n" + + // At this point, there will be one of 2 width or 1 width leftover, + // not both. + "cmp w5, #2\n" + "blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n" + + // Handle last 2 columns if exists. + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n" + // Mul-add left outputs. 
+ "smlal v21.4s, v0.4h, v9.4h\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "smlal v23.4s, v0.4h, v12.4h\n" + "ld1 {v9.8b}, [x12]\n" + "smlal2 v24.4s, v0.8h, v12.8h\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "smlal v23.4s, v1.4h, v13.4h\n" + "smlal2 v24.4s, v1.8h, v13.8h\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "smlal v23.4s, v2.4h, v14.4h\n" + "smlal2 v24.4s, v2.8h, v14.8h\n" + "smlal v21.4s, v3.4h, v12.4h\n" + "smlal2 v22.4s, v3.8h, v12.8h\n" + "ld1 {v12.8b}, [x13]\n" + "smlal v23.4s, v3.4h, v15.4h\n" + "smlal2 v24.4s, v3.8h, v15.8h\n" + "smlal v21.4s, v4.4h, v13.4h\n" + "smlal2 v22.4s, v4.8h, v13.8h\n" + "smlal v23.4s, v4.4h, v16.4h\n" + "smlal2 v24.4s, v4.8h, v16.8h\n" + "smlal v21.4s, v5.4h, v14.4h\n" + "smlal2 v22.4s, v5.8h, v14.8h\n" + "smlal v23.4s, v5.4h, v17.4h\n" + "smlal2 v24.4s, v5.8h, v17.8h\n" + "smlal v21.4s, v6.4h, v15.4h\n" + "smlal2 v22.4s, v6.8h, v15.8h\n" + "ld1 {v15.8b}, [x14]\n" + "smlal v23.4s, v6.4h, v18.4h\n" + "smlal2 v24.4s, v6.8h, v18.8h\n" + "ld1 {v18.8b}, [x15]\n" + "smlal v21.4s, v7.4h, v16.4h\n" + "smlal2 v22.4s, v7.8h, v16.8h\n" + "smlal v23.4s, v7.4h, v19.4h\n" + "smlal2 v24.4s, v7.8h, v19.8h\n" + "smlal v21.4s, v8.4h, v17.4h\n" + "smlal2 v22.4s, v8.8h, v17.8h\n" + "smlal v23.4s, v8.4h, v20.4h\n" + "smlal2 v24.4s, v8.8h, v20.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "sqrshl v21.4s, v21.4s, v28.4s\n" + "sqrshl v22.4s, v22.4s, v28.4s\n" + "sqrshl v23.4s, v23.4s, v28.4s\n" + "sqrshl v24.4s, v24.4s, v28.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqadd v21.8h, v21.8h, v29.8h\n" + "sqadd v23.8h, v23.8h, v29.8h\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun2 v21.16b, v23.8h\n" + "ld1 {v22.4s}, [x10]\n" + "umax v21.16b, v21.16b, v30.16b\n" + "umin v21.16b, v21.16b, v31.16b\n" + "ld1 {v24.4s}, [x10]\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "st1 {v21.8b}, [x6], x3\n" + "mov v23.d[0], v21.d[1]\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "st1 {v23.8b}, [x7], x3\n" + "uaddw v15.8h, v26.8h, v15.8b\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "uaddw v18.8h, v26.8h, v18.8b\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + + // Mul-add right outputs. 
+ "smlal v21.4s, v0.4h, v10.4h\n" + "smlal2 v22.4s, v0.8h, v10.8h\n" + "smlal v23.4s, v0.4h, v13.4h\n" + "smlal2 v24.4s, v0.8h, v13.8h\n" + "smlal v21.4s, v1.4h, v11.4h\n" + "smlal2 v22.4s, v1.8h, v11.8h\n" + "smlal v23.4s, v1.4h, v14.4h\n" + "smlal2 v24.4s, v1.8h, v14.8h\n" + "smlal v21.4s, v2.4h, v9.4h\n" + "smlal2 v22.4s, v2.8h, v9.8h\n" + "smlal v23.4s, v2.4h, v12.4h\n" + "smlal2 v24.4s, v2.8h, v12.8h\n" + "smlal v21.4s, v3.4h, v13.4h\n" + "smlal2 v22.4s, v3.8h, v13.8h\n" + "smlal v23.4s, v3.4h, v16.4h\n" + "smlal2 v24.4s, v3.8h, v16.8h\n" + "smlal v21.4s, v4.4h, v14.4h\n" + "smlal2 v22.4s, v4.8h, v14.8h\n" + "smlal v23.4s, v4.4h, v17.4h\n" + "smlal2 v24.4s, v4.8h, v17.8h\n" + "smlal v21.4s, v5.4h, v12.4h\n" + "smlal2 v22.4s, v5.8h, v12.8h\n" + "smlal v23.4s, v5.4h, v15.4h\n" + "smlal2 v24.4s, v5.8h, v15.8h\n" + "smlal v21.4s, v6.4h, v16.4h\n" + "smlal2 v22.4s, v6.8h, v16.8h\n" + "smlal v23.4s, v6.4h, v19.4h\n" + "smlal2 v24.4s, v6.8h, v19.8h\n" + "smlal v21.4s, v7.4h, v17.4h\n" + "smlal2 v22.4s, v7.8h, v17.8h\n" + "smlal v23.4s, v7.4h, v20.4h\n" + "smlal2 v24.4s, v7.8h, v20.8h\n" + "smlal v21.4s, v8.4h, v15.4h\n" + "smlal2 v22.4s, v8.8h, v15.8h\n" + "smlal v23.4s, v8.4h, v18.4h\n" + "smlal2 v24.4s, v8.8h, v18.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "sqrshl v21.4s, v21.4s, v28.4s\n" + "sqrshl v22.4s, v22.4s, v28.4s\n" + "sqrshl v23.4s, v23.4s, v28.4s\n" + "sqrshl v24.4s, v24.4s, v28.4s\n" + + "sqxtn v21.4h, v21.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqadd v21.8h, v21.8h, v29.8h\n" + "sqadd v23.8h, v23.8h, v29.8h\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun2 v21.16b, v23.8h\n" + "umax v21.16b, v21.16b, v30.16b\n" + "umin v21.16b, v21.16b, v31.16b\n" + "st1 {v21.8b}, [x6], x3\n" + "mov v23.d[0], v21.d[1]\n" + "st1 {v23.8b}, [x7], x3\n" + "b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n" + + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n" + "smlal v21.4s, v0.4h, v9.4h\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "smlal v23.4s, v0.4h, v12.4h\n" + "smlal2 v24.4s, v0.8h, v12.8h\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "smlal v23.4s, v1.4h, v13.4h\n" + "smlal2 v24.4s, v1.8h, v13.8h\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "smlal v23.4s, v2.4h, v14.4h\n" + "smlal2 v24.4s, v2.8h, v14.8h\n" + "smlal v21.4s, v3.4h, v12.4h\n" + "smlal2 v22.4s, v3.8h, v12.8h\n" + "smlal v23.4s, v3.4h, v15.4h\n" + "smlal2 v24.4s, v3.8h, v15.8h\n" + "smlal v21.4s, v4.4h, v13.4h\n" + "smlal2 v22.4s, v4.8h, v13.8h\n" + "smlal v23.4s, v4.4h, v16.4h\n" + "smlal2 v24.4s, v4.8h, v16.8h\n" + "smlal v21.4s, v5.4h, v14.4h\n" + "smlal2 v22.4s, v5.8h, v14.8h\n" + "smlal v23.4s, v5.4h, v17.4h\n" + "smlal2 v24.4s, v5.8h, v17.8h\n" + "smlal v21.4s, v6.4h, v15.4h\n" + "smlal2 v22.4s, v6.8h, v15.8h\n" + "smlal v23.4s, v6.4h, v18.4h\n" + "smlal2 v24.4s, v6.8h, v18.8h\n" + "smlal v21.4s, v7.4h, v16.4h\n" + "smlal2 v22.4s, v7.8h, v16.8h\n" + "smlal v23.4s, v7.4h, v19.4h\n" + "smlal2 v24.4s, v7.8h, v19.8h\n" + "smlal v21.4s, v8.4h, v17.4h\n" + "smlal2 v22.4s, v8.8h, v17.8h\n" + "smlal v23.4s, v8.4h, v20.4h\n" + "smlal2 v24.4s, v8.8h, v20.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "sqrshl v21.4s, v21.4s, v28.4s\n" + "sqrshl v22.4s, v22.4s, v28.4s\n" + "sqrshl v23.4s, v23.4s, 
v28.4s\n" + "sqrshl v24.4s, v24.4s, v28.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqadd v21.8h, v21.8h, v29.8h\n" + "sqadd v23.8h, v23.8h, v29.8h\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun2 v21.16b, v23.8h\n" + "umax v21.16b, v21.16b, v30.16b\n" + "umin v21.16b, v21.16b, v31.16b\n" + "st1 {v21.8b}, [x6], x3\n" + "mov v23.d[0], v21.d[1]\n" + "st1 {v23.8b}, [x7], x3\n" + + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n" + "subs %w[output_window_height], %w[output_window_height], #2\n" + "add %[input_ptr], %[input_ptr], %[input_height_increment]\n" + "cmp %w[output_window_height], #2\n" + "add %[output_ptr], %[output_ptr], %[output_height_increment]\n" + "bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n" + + DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n" + "cmp %w[output_window_height], #1\n" + "blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n" + + DEPTHWISECONV_LABEL_HEIGHT_1 ":\n" + "mov x12, %[input_ptr]\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "add x13, %[input_ptr], %[input_row_size]\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "add x14, x13, %[input_row_size]\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "add x15, x14, %[input_row_size]\n" + "mov w5, %w[output_window_width]\n" + "ld1 {v13.8b}, [x13], %[input_depth]\n" + "mov x6, %[output_ptr]\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "add x7, %[output_ptr], x1\n" + "ld1 {v15.8b}, [x13], %[input_depth]\n" + // The height 1 / width 2 loop loads an extra 1x1 output in anticipation + // for the next iteration. Make sure |output_window_width| is large + // enough to handle the additional load, otherwise jump to the + // appropriate label to handle smaller widths. + "cmp w5, #2\n" + "ld1 {v17.8b}, [x14], %[input_depth]\n" + "ld1 {v18.8b}, [x14], %[input_depth]\n" + "ld1 {v19.8b}, [x14], %[input_depth]\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "ld1 {v22.4s}, [x10]\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + "ld1 {v24.4s}, [x10]\n" + + "uaddw v9.8h, v26.8h, v9.8b\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + "uaddw v14.8h, v26.8h, v14.8b\n" + "uaddw v15.8h, v26.8h, v15.8b\n" + "uaddw v17.8h, v26.8h, v17.8b\n" + "uaddw v18.8h, v26.8h, v18.8b\n" + "uaddw v19.8h, v26.8h, v19.8b\n" + + "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n" + "cmp w5, #1\n" + "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n" + // Load inputs for 3x4 input window which corresponds to a 1x2 output + // window. 
+ "smlal v21.4s, v0.4h, v9.4h\n" + "ld1 {v12.8b}, [x12]\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "ld1 {v16.8b}, [x13]\n" + "smlal v23.4s, v0.4h, v10.4h\n" + "ld1 {v20.8b}, [x14]\n" + "smlal2 v24.4s, v0.8h, v10.8h\n" + "subs w5, w5, #2\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "cmp w5, #3\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "add %[input_ptr], %[input_ptr], %[input_width_increment]\n" + "smlal v23.4s, v1.4h, v11.4h\n" + "mov x12, %[input_ptr]\n" + "smlal2 v24.4s, v1.8h, v11.8h\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "add x13, %[input_ptr], %[input_row_size]\n" + "smlal v23.4s, v2.4h, v12.4h\n" + "add x14, x13, %[input_row_size]\n" + "smlal2 v24.4s, v2.8h, v12.8h\n" + "smlal v21.4s, v3.4h, v13.4h\n" + "add x15, x14, %[input_row_size]\n" + "smlal2 v22.4s, v3.8h, v13.8h\n" + "ld1 {v13.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v3.4h, v14.4h\n" + "smlal2 v24.4s, v3.8h, v14.8h\n" + "smlal v21.4s, v4.4h, v14.4h\n" + "smlal2 v22.4s, v4.8h, v14.8h\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v4.4h, v15.4h\n" + "smlal2 v24.4s, v4.8h, v15.8h\n" + "smlal v21.4s, v5.4h, v15.4h\n" + "uaddw v16.8h, v26.8h, v16.8b\n" + "smlal2 v22.4s, v5.8h, v15.8h\n" + "ld1 {v15.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v5.4h, v16.4h\n" + "smlal2 v24.4s, v5.8h, v16.8h\n" + "smlal v21.4s, v6.4h, v17.4h\n" + "smlal2 v22.4s, v6.8h, v17.8h\n" + "ld1 {v17.8b}, [x14], %[input_depth]\n" + "smlal v23.4s, v6.4h, v18.4h\n" + "smlal2 v24.4s, v6.8h, v18.8h\n" + "smlal v21.4s, v7.4h, v18.4h\n" + "smlal2 v22.4s, v7.8h, v18.8h\n" + "ld1 {v18.8b}, [x14], %[input_depth]\n" + "smlal v23.4s, v7.4h, v19.4h\n" + "smlal2 v24.4s, v7.8h, v19.8h\n" + "smlal v21.4s, v8.4h, v19.4h\n" + "uaddw v20.8h, v26.8h, v20.8b\n" + "smlal2 v22.4s, v8.8h, v19.8h\n" + "ld1 {v19.8b}, [x14], %[input_depth]\n" + "smlal v23.4s, v8.4h, v20.4h\n" + "smlal2 v24.4s, v8.8h, v20.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "sqrshl v21.4s, v21.4s, v28.4s\n" + "sqrshl v22.4s, v22.4s, v28.4s\n" + "sqrshl v23.4s, v23.4s, v28.4s\n" + "sqrshl v24.4s, v24.4s, v28.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqadd v21.8h, v21.8h, v29.8h\n" + "sqadd v23.8h, v23.8h, v29.8h\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun2 v21.16b, v23.8h\n" + "ld1 {v22.4s}, [x10]\n" + "umax v21.16b, v21.16b, v30.16b\n" + "umin v21.16b, v21.16b, v31.16b\n" + "ld1 {v24.4s}, [x10]\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "st1 {v21.8b}, [%[output_ptr]], x3\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "mov v23.d[0], v21.d[1]\n" + "st1 {v23.8b}, [%[output_ptr]], x3\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + "uaddw v14.8h, v26.8h, v14.8b\n" + "uaddw v15.8h, v26.8h, v15.8b\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "uaddw v16.8h, v26.8h, v16.8b\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + "uaddw v17.8h, v26.8h, v17.8b\n" + "uaddw v18.8h, v26.8h, v18.8b\n" + "uaddw v19.8h, v26.8h, v19.8b\n" + "uaddw v20.8h, v26.8h, v20.8b\n" + + "bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n" + + // At this point, there will be one of 2 width or 1 width leftover, + // not both. 
+ "cmp w5, #2\n" + "blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n" + + // Handle last two horizontal outputs if exists. + DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n" + "smlal v21.4s, v0.4h, v9.4h\n" + "ld1 {v12.8b}, [x12], %[input_depth]\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "ld1 {v16.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v0.4h, v10.4h\n" + "ld1 {v20.8b}, [x14], %[input_depth]\n" + "smlal2 v24.4s, v0.8h, v10.8h\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "smlal v23.4s, v1.4h, v11.4h\n" + "smlal2 v24.4s, v1.8h, v11.8h\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "smlal v23.4s, v2.4h, v12.4h\n" + "smlal2 v24.4s, v2.8h, v12.8h\n" + "smlal v21.4s, v3.4h, v13.4h\n" + "smlal2 v22.4s, v3.8h, v13.8h\n" + "smlal v23.4s, v3.4h, v14.4h\n" + "smlal2 v24.4s, v3.8h, v14.8h\n" + "smlal v21.4s, v4.4h, v14.4h\n" + "smlal2 v22.4s, v4.8h, v14.8h\n" + "smlal v23.4s, v4.4h, v15.4h\n" + "smlal2 v24.4s, v4.8h, v15.8h\n" + "smlal v21.4s, v5.4h, v15.4h\n" + "uaddw v16.8h, v26.8h, v16.8b\n" + "smlal2 v22.4s, v5.8h, v15.8h\n" + "smlal v23.4s, v5.4h, v16.4h\n" + "smlal2 v24.4s, v5.8h, v16.8h\n" + "smlal v21.4s, v6.4h, v17.4h\n" + "smlal2 v22.4s, v6.8h, v17.8h\n" + "smlal v23.4s, v6.4h, v18.4h\n" + "smlal2 v24.4s, v6.8h, v18.8h\n" + "smlal v21.4s, v7.4h, v18.4h\n" + "smlal2 v22.4s, v7.8h, v18.8h\n" + "smlal v23.4s, v7.4h, v19.4h\n" + "smlal2 v24.4s, v7.8h, v19.8h\n" + "smlal v21.4s, v8.4h, v19.4h\n" + "uaddw v20.8h, v26.8h, v20.8b\n" + "smlal2 v22.4s, v8.8h, v19.8h\n" + "smlal v23.4s, v8.4h, v20.4h\n" + "smlal2 v24.4s, v8.8h, v20.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "sqrshl v21.4s, v21.4s, v28.4s\n" + "sqrshl v22.4s, v22.4s, v28.4s\n" + "sqrshl v23.4s, v23.4s, v28.4s\n" + "sqrshl v24.4s, v24.4s, v28.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqadd v21.8h, v21.8h, v29.8h\n" + "sqadd v23.8h, v23.8h, v29.8h\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun2 v21.16b, v23.8h\n" + "umax v21.16b, v21.16b, v30.16b\n" + "umin v21.16b, v21.16b, v31.16b\n" + "st1 {v21.8b}, [%[output_ptr]], x3\n" + "mov v23.d[0], v21.d[1]\n" + "st1 {v23.8b}, [%[output_ptr]], x3\n" + "b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n" + + // Handle bottom right output if exists. 
+ DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n" + "smlal v21.4s, v0.4h, v9.4h\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "smlal v21.4s, v3.4h, v13.4h\n" + "smlal2 v22.4s, v3.8h, v13.8h\n" + "smlal v21.4s, v4.4h, v14.4h\n" + "smlal2 v22.4s, v4.8h, v14.8h\n" + "smlal v21.4s, v5.4h, v15.4h\n" + "smlal2 v22.4s, v5.8h, v15.8h\n" + "smlal v21.4s, v6.4h, v17.4h\n" + "smlal2 v22.4s, v6.8h, v17.8h\n" + "smlal v21.4s, v7.4h, v18.4h\n" + "smlal2 v22.4s, v7.8h, v18.8h\n" + "smlal v21.4s, v8.4h, v19.4h\n" + "smlal2 v22.4s, v8.8h, v19.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrshl v21.4s, v21.4s, v28.4s\n" + "sqrshl v22.4s, v22.4s, v28.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqadd v21.8h, v21.8h, v29.8h\n" + "sqxtun v21.8b, v21.8h\n" + "umax v21.8b, v21.8b, v30.8b\n" + "umin v21.8b, v21.8b, v31.8b\n" + "st1 {v21.8b}, [%[output_ptr]]\n" + DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n" + : + // Outputs. + [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr), + [output_ptr] "+r"(output_ptr), + [output_window_height] "+r"(output_window_height) + : + // Inputs. + [bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size), + [input_depth] "r"(input_depth), + [output_window_width] "r"(output_window_width), + [input_width_increment] "r"(input_width_increment), + [input_height_increment] "r"(input_height_increment), + [output_height_increment] "r"(output_height_increment), + [params_ptr] "r"(params_ptr) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", + // We use these general-purpose registers. 
+ "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x9", "x10", "x11", "x12", "x13", "x14", "x15"); +#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_1 +#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_1_END + } +}; + +template <> +struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2, + 2> { + static inline void Run(const uint8* input_ptr, const uint8* filter_ptr, + const int32* bias_ptr, uint8* output_ptr, + int64_t input_depth, int64_t input_row_size, + int32 output_window_height, int32 output_window_width, + const DepthwiseConvParams* params_ptr) { + const int64_t input_width_increment = 4 * input_depth; + const int64_t input_height_increment = 4 * input_row_size; + const int64_t output_height_increment = 2 * params_ptr->output_row_size; + +#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "3" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "4" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "5" +#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "6" +#define DEPTHWISECONV_LABEL_HEIGHT_1 "7" +#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "8" +#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "9" +#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "10" +#define DEPTHWISECONV_LABEL_HEIGHT_1_END "11" + + asm volatile( + // Performs depthwise convolutions for a window specified by + // |output_window_height| and |output_window_width|. The inner-most loop + // processes 2x2 outputs, and any leftovers at the end. + // + // Algorithm works as follows: + // + // 1. Load filters of 8 depth (8x3x3). Registers v0--v8 hold filter + // values. + // 2. For 2 output heights at a time: + // i. For 2 output widths at a time at stride 2, a 5x5 input + // window is required. To avoid register exhaustion, we load + // the first 2 rows of the 5x5 input window into registers + // v9--v18, and use the same registers to load the next 2 + // rows, and finally v9--v13 to load the last row. + // Accumulators for all 2x2 outputs are reserved by registers + // v21-v22 (top left output), v23-v24 (top right output), + // v19-v20 (bottom left output), v25-v26 (bottom right + // output). + // ii. Handle single leftover width if exists. + // 3. Handle single leftover height if exists. + // i. For 2 output widths at a time at stride 2, load inputs for + // a 1x2 (1 height, 2 width) output window (3x5 input + // window). Registers v9--v24 hold input values. Mul-add with + // accumulators v24--v27. + // ii. Handle single leftover width if exists. + // + // Loads are placed as soon as the register is no longer needed and + // interleaved with arithmetic operations to take advantage of + // dual-issue pipelines. We also add input offsets as far from the loads + // as possible to give loads enough cycles to fetch data from memory. + + // Set "constant" registers. These registers may be replaced with temp + // values from time to time when there are not enough NEON registers. 
+ // We use x9--x15 general purpose registers as they are caller-saved + // temporary registers (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf). // NOLINT + "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n" + "ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n" + "cmp %w[output_window_height], #2\n" + "dup v28.8h, w0\n" + "ldr w1, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n" + "dup v26.4s, w9\n" + "ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n" + "dup v27.4s, w1\n" + "ldr w3, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n" + "dup v29.8h, w2\n" + "ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n" + "dup v30.16b, w3\n" + "ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n" + "dup v31.16b, w4\n" + "ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n" + "ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n" + + // Load filters and add offsets. + "add x10, %[bias_ptr], #16\n" + "ld1 {v0.8b}, [%[filter_ptr]], x5\n" + "dup v9.8h, w20\n" + "ld1 {v1.8b}, [%[filter_ptr]], x5\n" + "uaddw v0.8h, v9.8h, v0.8b\n" + "ld1 {v2.8b}, [%[filter_ptr]], x5\n" + "uaddw v1.8h, v9.8h, v1.8b\n" + "ld1 {v3.8b}, [%[filter_ptr]], x5\n" + "uaddw v2.8h, v9.8h, v2.8b\n" + "ld1 {v4.8b}, [%[filter_ptr]], x5\n" + "uaddw v3.8h, v9.8h, v3.8b\n" + "ld1 {v5.8b}, [%[filter_ptr]], x5\n" + "uaddw v4.8h, v9.8h, v4.8b\n" + "ld1 {v6.8b}, [%[filter_ptr]], x5\n" + "uaddw v5.8h, v9.8h, v5.8b\n" + "ld1 {v7.8b}, [%[filter_ptr]], x5\n" + "uaddw v6.8h, v9.8h, v6.8b\n" + "ld1 {v8.8b}, [%[filter_ptr]]\n" + "uaddw v7.8h, v9.8h, v7.8b\n" + "uaddw v8.8h, v9.8h, v8.8b\n" + + "blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n" + // Load the first two rows of the 5x5 input window, then reuse the + // same registers to load subsequent rows as they become available. + "mov x11, %[input_ptr]\n" + "mov x12, x11\n" + "add x13, x12, %[input_row_size]\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "mov w14, %w[output_window_width]\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + // The height 2 / width 2 loop loads an extra 1 output horizontally in + // anticipation for the next iteration. Make sure + // |output_window_width| is large enough to handle the additional + // load, otherwise jump to the appropriate label to handle smaller + // widths. 
+ "cmp w14, #2\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "add x15, x13, %[input_row_size]\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "mov x6, %[output_ptr]\n" + "ld1 {v15.8b}, [x13], %[input_depth]\n" + "add x7, %[output_ptr], x19\n" + "ld1 {v16.8b}, [x13], %[input_depth]\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "ld1 {v22.4s}, [x10]\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "ld1 {v24.4s}, [x10]\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "ld1 {v19.4s}, [%[bias_ptr]]\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "ld1 {v20.4s}, [x10]\n" + "uaddw v14.8h, v28.8h, v14.8b\n" + "ld1 {v25.4s}, [%[bias_ptr]]\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "ld1 {v26.4s}, [x10]\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + + "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n" + "cmp w14, #1\n" + "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n" + "smlal v21.4s, v0.4h, v9.4h\n" + "ld1 {v12.8b}, [x12], %[input_depth]\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "ld1 {v13.8b}, [x12]\n" + "add x12, x15, %[input_row_size]\n" + "smlal v23.4s, v0.4h, v11.4h\n" + "ld1 {v17.8b}, [x13], %[input_depth]\n" + "smlal2 v24.4s, v0.8h, v11.8h\n" + "ld1 {v18.8b}, [x13]\n" + "add x13, x12, %[input_row_size]\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "ld1 {v9.8b}, [x15], %[input_depth]\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "ld1 {v10.8b}, [x15], %[input_depth]\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "ld1 {v11.8b}, [x15], %[input_depth]\n" + "smlal v21.4s, v3.4h, v14.4h\n" + "smlal2 v22.4s, v3.8h, v14.8h\n" + "ld1 {v14.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v3.4h, v16.4h\n" + "subs w14, w14, #2\n" + "smlal2 v24.4s, v3.8h, v16.8h\n" + "cmp w14, #3\n" + "smlal v21.4s, v4.4h, v15.4h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal2 v22.4s, v4.8h, v15.8h\n" + "ld1 {v15.8b}, [x12], %[input_depth]\n" + "smlal v21.4s, v5.4h, v16.4h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "smlal2 v22.4s, v5.8h, v16.8h\n" + "ld1 {v16.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v1.4h, v12.4h\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + "smlal2 v24.4s, v1.8h, v12.8h\n" + "ld1 {v12.8b}, [x15], %[input_depth]\n" + "smlal v23.4s, v2.4h, v13.4h\n" + "uaddw v18.8h, v28.8h, v18.8b\n" + "smlal2 v24.4s, v2.8h, v13.8h\n" + "ld1 {v13.8b}, [x15]\n" + "smlal v23.4s, v4.4h, v17.4h\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "smlal2 v24.4s, v4.8h, v17.8h\n" + "ld1 {v17.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v5.4h, v18.4h\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "smlal2 v24.4s, v5.8h, v18.8h\n" + "ld1 {v18.8b}, [x12]\n" + + "smlal v21.4s, v6.4h, v9.4h\n" + "smlal2 v22.4s, v6.8h, v9.8h\n" + "smlal v19.4s, v0.4h, v9.4h\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "smlal2 v20.4s, v0.8h, v9.8h\n" + "ld1 {v9.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v6.4h, v11.4h\n" + "smlal2 v24.4s, v6.8h, v11.8h\n" + "smlal v21.4s, v7.4h, v10.4h\n" + "smlal2 v22.4s, v7.8h, v10.8h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal v19.4s, v1.4h, v10.4h\n" + "smlal2 v20.4s, v1.8h, v10.8h\n" + "ld1 {v10.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v7.4h, v12.4h\n" + "smlal2 v24.4s, v7.8h, v12.8h\n" + "smlal v25.4s, v1.4h, v12.4h\n" + "smlal2 v26.4s, v1.8h, v12.8h\n" + "smlal v21.4s, v8.4h, v11.4h\n" + "smlal2 v22.4s, v8.8h, v11.8h\n" + "add x11, x11, %[input_width_increment]\n" + "smlal v19.4s, v2.4h, v11.4h\n" + "mov x12, x11\n" + "smlal2 v20.4s, v2.8h, v11.8h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "smlal v25.4s, v0.4h, v11.4h\n" + "smlal2 v26.4s, v0.8h, 
v11.8h\n" + "ld1 {v11.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v8.4h, v13.4h\n" + "ld1 {v12.8b}, [x13], %[input_depth]\n" + "smlal2 v24.4s, v8.8h, v13.8h\n" + "smlal v25.4s, v2.4h, v13.4h\n" + "smlal2 v26.4s, v2.8h, v13.8h\n" + "ld1 {v13.8b}, [x13]\n" + "add x13, x12, %[input_row_size]\n" + "add x15, x13, %[input_row_size]\n" + + "dup v28.4s, w9\n" + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "sqrshl v21.4s, v21.4s, v28.4s\n" + "sqrshl v22.4s, v22.4s, v28.4s\n" + "sqrshl v23.4s, v23.4s, v28.4s\n" + "sqrshl v24.4s, v24.4s, v28.4s\n" + "dup v28.8h, w0\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqadd v21.8h, v21.8h, v29.8h\n" + "sqadd v23.8h, v23.8h, v29.8h\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun2 v21.16b, v23.8h\n" + "ld1 {v22.4s}, [x10]\n" + "umax v21.16b, v21.16b, v30.16b\n" + "umin v21.16b, v21.16b, v31.16b\n" + "ld1 {v24.4s}, [x10]\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "st1 {v21.8b}, [x6], x5\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "mov v23.d[0], v21.d[1]\n" + "st1 {v23.8b}, [x6], x5\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + + "smlal v19.4s, v6.4h, v9.4h\n" + "smlal2 v20.4s, v6.8h, v9.8h\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "smlal v25.4s, v6.4h, v11.4h\n" + "smlal2 v26.4s, v6.8h, v11.8h\n" + "smlal v19.4s, v7.4h, v10.4h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal2 v20.4s, v7.8h, v10.8h\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "smlal v25.4s, v7.4h, v12.4h\n" + "smlal2 v26.4s, v7.8h, v12.8h\n" + "smlal v19.4s, v8.4h, v11.4h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "smlal2 v20.4s, v8.8h, v11.8h\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "smlal v25.4s, v8.4h, v13.4h\n" + "uaddw v14.8h, v28.8h, v14.8b\n" + "smlal2 v26.4s, v8.8h, v13.8h\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + "smlal v19.4s, v3.4h, v14.4h\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "smlal2 v20.4s, v3.8h, v14.8h\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "smlal v25.4s, v3.4h, v16.4h\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "smlal2 v26.4s, v3.8h, v16.8h\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + "smlal v19.4s, v4.4h, v15.4h\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + "smlal2 v20.4s, v4.8h, v15.8h\n" + "ld1 {v15.8b}, [x13], %[input_depth]\n" + "smlal v25.4s, v4.4h, v17.4h\n" + "smlal2 v26.4s, v4.8h, v17.8h\n" + "smlal v19.4s, v5.4h, v16.4h\n" + "uaddw v18.8h, v28.8h, v18.8b\n" + "smlal2 v20.4s, v5.8h, v16.8h\n" + "ld1 {v16.8b}, [x13], %[input_depth]\n" + "smlal v25.4s, v5.4h, v18.4h\n" + "smlal2 v26.4s, v5.8h, v18.8h\n" + + "dup v28.4s, w9\n" + "sqrdmulh v19.4s, v19.4s, v27.4s\n" + "sqrdmulh v20.4s, v20.4s, v27.4s\n" + "sqrdmulh v25.4s, v25.4s, v27.4s\n" + "sqrdmulh v26.4s, v26.4s, v27.4s\n" + "sqrshl v19.4s, v19.4s, v28.4s\n" + "sqrshl v20.4s, v20.4s, v28.4s\n" + "sqrshl v25.4s, v25.4s, v28.4s\n" + "sqrshl v26.4s, v26.4s, v28.4s\n" + "dup v28.8h, w0\n" + "sqxtn v19.4h, v19.4s\n" + "sqxtn2 v19.8h, v20.4s\n" + "sqxtn v25.4h, v25.4s\n" + "sqxtn2 v25.8h, v26.4s\n" + "sqadd v19.8h, v19.8h, v29.8h\n" + "sqadd v25.8h, v25.8h, v29.8h\n" + "sqxtun v19.8b, v19.8h\n" + "sqxtun2 v19.16b, v25.8h\n" + "ld1 {v20.4s}, [x10]\n" + "umax v19.16b, v19.16b, v30.16b\n" + "umin v19.16b, v19.16b, v31.16b\n" + "ld1 {v26.4s}, [x10]\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "st1 {v19.8b}, [x7], x5\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "mov v25.d[0], v19.d[1]\n" + "st1 {v25.8b}, [x7], x5\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "ld1 {v19.4s}, [%[bias_ptr]]\n" + 
"uaddw v14.8h, v28.8h, v14.8b\n" + "ld1 {v25.4s}, [%[bias_ptr]]\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + + "bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n" + + // At this point, there will be one of 2 width or 1 width leftover, + // not both. + "cmp w14, #2\n" + "blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n" + + // Handle last 2 columns if exists. + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n" + "smlal v21.4s, v0.4h, v9.4h\n" + "ld1 {v12.8b}, [x12], %[input_depth]\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "ld1 {v13.8b}, [x12]\n" + "add x12, x15, %[input_row_size]\n" + "smlal v23.4s, v0.4h, v11.4h\n" + "ld1 {v17.8b}, [x13], %[input_depth]\n" + "smlal2 v24.4s, v0.8h, v11.8h\n" + "ld1 {v18.8b}, [x13]\n" + "add x13, x12, %[input_row_size]\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "ld1 {v9.8b}, [x15], %[input_depth]\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "ld1 {v10.8b}, [x15], %[input_depth]\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "ld1 {v11.8b}, [x15], %[input_depth]\n" + "smlal v21.4s, v3.4h, v14.4h\n" + "smlal2 v22.4s, v3.8h, v14.8h\n" + "ld1 {v14.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v3.4h, v16.4h\n" + "smlal2 v24.4s, v3.8h, v16.8h\n" + "smlal v21.4s, v4.4h, v15.4h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal2 v22.4s, v4.8h, v15.8h\n" + "ld1 {v15.8b}, [x12], %[input_depth]\n" + "smlal v21.4s, v5.4h, v16.4h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "smlal2 v22.4s, v5.8h, v16.8h\n" + "ld1 {v16.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v1.4h, v12.4h\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + "smlal2 v24.4s, v1.8h, v12.8h\n" + "ld1 {v12.8b}, [x15], %[input_depth]\n" + "smlal v23.4s, v2.4h, v13.4h\n" + "uaddw v18.8h, v28.8h, v18.8b\n" + "smlal2 v24.4s, v2.8h, v13.8h\n" + "ld1 {v13.8b}, [x15]\n" + "smlal v23.4s, v4.4h, v17.4h\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "smlal2 v24.4s, v4.8h, v17.8h\n" + "ld1 {v17.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v5.4h, v18.4h\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "smlal2 v24.4s, v5.8h, v18.8h\n" + "ld1 {v18.8b}, [x12]\n" + + "smlal v21.4s, v6.4h, v9.4h\n" + "smlal2 v22.4s, v6.8h, v9.8h\n" + "smlal v19.4s, v0.4h, v9.4h\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "smlal2 v20.4s, v0.8h, v9.8h\n" + "ld1 {v9.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v6.4h, v11.4h\n" + "smlal2 v24.4s, v6.8h, v11.8h\n" + "smlal v21.4s, v7.4h, v10.4h\n" + "smlal2 v22.4s, v7.8h, v10.8h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal v19.4s, v1.4h, v10.4h\n" + "smlal2 v20.4s, v1.8h, v10.8h\n" + "ld1 {v10.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v7.4h, v12.4h\n" + "smlal2 v24.4s, v7.8h, v12.8h\n" + "smlal v25.4s, v1.4h, v12.4h\n" + "smlal2 v26.4s, v1.8h, v12.8h\n" + "smlal v21.4s, v8.4h, v11.4h\n" + "smlal2 v22.4s, v8.8h, v11.8h\n" + "smlal v19.4s, v2.4h, v11.4h\n" + "smlal2 v20.4s, v2.8h, v11.8h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "smlal v25.4s, v0.4h, v11.4h\n" + "smlal2 v26.4s, v0.8h, v11.8h\n" + "ld1 {v11.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v8.4h, v13.4h\n" + "ld1 {v12.8b}, [x13], %[input_depth]\n" + "smlal2 v24.4s, v8.8h, v13.8h\n" + "smlal v25.4s, v2.4h, v13.4h\n" + "smlal2 v26.4s, v2.8h, v13.8h\n" + "ld1 {v13.8b}, [x13]\n" + + "dup v28.4s, w9\n" + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "sqrshl v21.4s, v21.4s, v28.4s\n" + "sqrshl v22.4s, v22.4s, v28.4s\n" + "sqrshl v23.4s, v23.4s, v28.4s\n" + "sqrshl v24.4s, v24.4s, v28.4s\n" + "dup v28.8h, w0\n" + 
"sqxtn v21.4h, v21.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqadd v21.8h, v21.8h, v29.8h\n" + "sqadd v23.8h, v23.8h, v29.8h\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun2 v21.16b, v23.8h\n" + "ld1 {v22.4s}, [x10]\n" + "umax v21.16b, v21.16b, v30.16b\n" + "umin v21.16b, v21.16b, v31.16b\n" + "ld1 {v24.4s}, [x10]\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "st1 {v21.8b}, [x6], x5\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "mov v23.d[0], v21.d[1]\n" + "st1 {v23.8b}, [x6]\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + + "smlal v19.4s, v6.4h, v9.4h\n" + "smlal2 v20.4s, v6.8h, v9.8h\n" + "smlal v25.4s, v6.4h, v11.4h\n" + "smlal2 v26.4s, v6.8h, v11.8h\n" + "smlal v19.4s, v7.4h, v10.4h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal2 v20.4s, v7.8h, v10.8h\n" + "smlal v25.4s, v7.4h, v12.4h\n" + "smlal2 v26.4s, v7.8h, v12.8h\n" + "smlal v19.4s, v8.4h, v11.4h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "smlal2 v20.4s, v8.8h, v11.8h\n" + "smlal v25.4s, v8.4h, v13.4h\n" + "uaddw v14.8h, v28.8h, v14.8b\n" + "smlal2 v26.4s, v8.8h, v13.8h\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + "smlal v19.4s, v3.4h, v14.4h\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "smlal2 v20.4s, v3.8h, v14.8h\n" + "smlal v25.4s, v3.4h, v16.4h\n" + "smlal2 v26.4s, v3.8h, v16.8h\n" + "smlal v19.4s, v4.4h, v15.4h\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + "smlal2 v20.4s, v4.8h, v15.8h\n" + "smlal v25.4s, v4.4h, v17.4h\n" + "smlal2 v26.4s, v4.8h, v17.8h\n" + "smlal v19.4s, v5.4h, v16.4h\n" + "uaddw v18.8h, v28.8h, v18.8b\n" + "smlal2 v20.4s, v5.8h, v16.8h\n" + "smlal v25.4s, v5.4h, v18.4h\n" + "smlal2 v26.4s, v5.8h, v18.8h\n" + + "dup v28.4s, w9\n" + "sqrdmulh v19.4s, v19.4s, v27.4s\n" + "sqrdmulh v20.4s, v20.4s, v27.4s\n" + "sqrdmulh v25.4s, v25.4s, v27.4s\n" + "sqrdmulh v26.4s, v26.4s, v27.4s\n" + "sqrshl v19.4s, v19.4s, v28.4s\n" + "sqrshl v20.4s, v20.4s, v28.4s\n" + "sqrshl v25.4s, v25.4s, v28.4s\n" + "sqrshl v26.4s, v26.4s, v28.4s\n" + "dup v28.8h, w0\n" + "sqxtn v19.4h, v19.4s\n" + "sqxtn2 v19.8h, v20.4s\n" + "sqxtn v25.4h, v25.4s\n" + "sqxtn2 v25.8h, v26.4s\n" + "sqadd v19.8h, v19.8h, v29.8h\n" + "sqadd v25.8h, v25.8h, v29.8h\n" + "sqxtun v19.8b, v19.8h\n" + "sqxtun2 v19.16b, v25.8h\n" + "umax v19.16b, v19.16b, v30.16b\n" + "umin v19.16b, v19.16b, v31.16b\n" + "st1 {v19.8b}, [x7], x5\n" + "mov v25.d[0], v19.d[1]\n" + "st1 {v25.8b}, [x7]\n" + "b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n" + + // Handle last column if exists. + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n" + // Registers v9, v10, v11, v14, v15, and v16 have already been loaded + // with the correct values at this point. This corresponds to the + // first two input rows of the top left output. Now load the last + // input row for this output. Once these inputs are no longer needed, + // load the input rows for the bottom left output. 
+ "add x12, x15, %[input_row_size]\n" + "add x13, x12, %[input_row_size]\n" + + "ld1 {v12.8b}, [x15], %[input_depth]\n" + "smlal v21.4s, v0.4h, v9.4h\n" + "ld1 {v13.8b}, [x15], %[input_depth]\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "ld1 {v17.8b}, [x15]\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "ld1 {v11.8b}, [x12]\n" + "smlal v21.4s, v3.4h, v14.4h\n" + "smlal2 v22.4s, v3.8h, v14.8h\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "smlal v21.4s, v4.4h, v15.4h\n" + "smlal2 v22.4s, v4.8h, v15.8h\n" + "ld1 {v15.8b}, [x13], %[input_depth]\n" + "smlal v21.4s, v5.4h, v16.4h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal2 v22.4s, v5.8h, v16.8h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "ld1 {v16.8b}, [x13]\n" + + "smlal v21.4s, v6.4h, v12.4h\n" + "smlal2 v22.4s, v6.8h, v12.8h\n" + "smlal v23.4s, v0.4h, v12.4h\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + "smlal2 v24.4s, v0.8h, v12.8h\n" + "smlal v21.4s, v7.4h, v13.4h\n" + "smlal2 v22.4s, v7.8h, v13.8h\n" + "smlal v23.4s, v1.4h, v13.4h\n" + "smlal2 v24.4s, v1.8h, v13.8h\n" + "smlal v21.4s, v8.4h, v17.4h\n" + "smlal2 v22.4s, v8.8h, v17.8h\n" + "smlal v23.4s, v2.4h, v17.4h\n" + "smlal2 v24.4s, v2.8h, v17.8h\n" + + "dup v26.4s, w9\n" + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrshl v21.4s, v21.4s, v26.4s\n" + "sqrshl v22.4s, v22.4s, v26.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqadd v21.8h, v21.8h, v29.8h\n" + "sqxtun v21.8b, v21.8h\n" + "umax v21.8b, v21.8b, v30.8b\n" + "umin v21.8b, v21.8b, v31.8b\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "st1 {v21.8b}, [x6]\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + + "smlal v23.4s, v3.4h, v9.4h\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "smlal2 v24.4s, v3.8h, v9.8h\n" + "uaddw v14.8h, v28.8h, v14.8b\n" + "smlal v23.4s, v4.4h, v10.4h\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "smlal2 v24.4s, v4.8h, v10.8h\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + "smlal v23.4s, v5.4h, v11.4h\n" + "smlal2 v24.4s, v5.8h, v11.8h\n" + + "smlal v23.4s, v6.4h, v14.4h\n" + "smlal2 v24.4s, v6.8h, v14.8h\n" + "smlal v23.4s, v7.4h, v15.4h\n" + "smlal2 v24.4s, v7.8h, v15.8h\n" + "smlal v23.4s, v8.4h, v16.4h\n" + "smlal2 v24.4s, v8.8h, v16.8h\n" + + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "sqrshl v23.4s, v23.4s, v26.4s\n" + "sqrshl v24.4s, v24.4s, v26.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqadd v23.8h, v23.8h, v29.8h\n" + "sqxtun v23.8b, v23.8h\n" + "umax v23.8b, v23.8b, v30.8b\n" + "umin v23.8b, v23.8b, v31.8b\n" + "st1 {v23.8b}, [x7]\n" + + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n" + "subs %w[output_window_height], %w[output_window_height], #2\n" + "add %[input_ptr], %[input_ptr], %[input_height_increment]\n" + "cmp %w[output_window_height], #2\n" + "add %[output_ptr], %[output_ptr], %[output_height_increment]\n" + "bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n" + + DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n" + "cmp %w[output_window_height], #1\n" + "blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n" + + DEPTHWISECONV_LABEL_HEIGHT_1 ":\n" + "mov x11, %[input_ptr]\n" + "mov x12, x11\n" + "add x13, x12, %[input_row_size]\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "add x15, x13, %[input_row_size]\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "mov x6, %[output_ptr]\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "mov w14, %w[output_window_width]\n" + // The 
height 1 / width 2 loop loads an extra 1x1 output in anticipation + // for the next iteration. Make sure |output_window_width| is large + // enough to handle the additional load, otherwise jump to the + // appropriate label to handle smaller widths. + "cmp w14, #2\n" + "ld1 {v12.8b}, [x13], %[input_depth]\n" + "ld1 {v13.8b}, [x13], %[input_depth]\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "ld1 {v15.8b}, [x15], %[input_depth]\n" + "ld1 {v16.8b}, [x15], %[input_depth]\n" + "ld1 {v17.8b}, [x15], %[input_depth]\n" + + "uaddw v9.8h, v28.8h, v9.8b\n" + "ld1 {v24.4s}, [%[bias_ptr]]\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "ld1 {v25.4s}, [x10]\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "ld1 {v26.4s}, [%[bias_ptr]]\n" + "ld1 {v27.4s}, [x10]\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "uaddw v14.8h, v28.8h, v14.8b\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + + "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n" + "cmp w14, #1\n" + "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n" + "smlal v24.4s, v0.4h, v9.4h\n" + "ld1 {v18.8b}, [x12], %[input_depth]\n" + "smlal2 v25.4s, v0.8h, v9.8h\n" + "ld1 {v19.8b}, [x12]\n" + "smlal v26.4s, v0.4h, v11.4h\n" + "ld1 {v20.8b}, [x13], %[input_depth]\n" + "smlal2 v27.4s, v0.8h, v11.8h\n" + "ld1 {v21.8b}, [x13]\n" + "smlal v24.4s, v1.4h, v10.4h\n" + "ld1 {v22.8b}, [x15], %[input_depth]\n" + "smlal2 v25.4s, v1.8h, v10.8h\n" + "ld1 {v23.8b}, [x15]\n" + "smlal v24.4s, v2.4h, v11.4h\n" + "subs w14, w14, #2\n" + "smlal2 v25.4s, v2.8h, v11.8h\n" + "cmp w14, #3\n" + "smlal v24.4s, v3.4h, v12.4h\n" + "add x11, x11, %[input_width_increment]\n" + "smlal2 v25.4s, v3.8h, v12.8h\n" + "mov x12, x11\n" + "smlal v26.4s, v3.4h, v14.4h\n" + "add x13, x12, %[input_row_size]\n" + "smlal2 v27.4s, v3.8h, v14.8h\n" + "add x15, x13, %[input_row_size]\n" + "smlal v24.4s, v4.4h, v13.4h\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "smlal2 v25.4s, v4.8h, v13.8h\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "smlal v24.4s, v5.4h, v14.4h\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "smlal2 v25.4s, v5.8h, v14.8h\n" + "ld1 {v12.8b}, [x13], %[input_depth]\n" + "smlal v24.4s, v6.4h, v15.4h\n" + "ld1 {v13.8b}, [x13], %[input_depth]\n" + "smlal2 v25.4s, v6.8h, v15.8h\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "smlal v26.4s, v6.4h, v17.4h\n" + "ld1 {v15.8b}, [x15], %[input_depth]\n" + "smlal2 v27.4s, v6.8h, v17.8h\n" + "smlal v24.4s, v7.4h, v16.4h\n" + "smlal2 v25.4s, v7.8h, v16.8h\n" + "ld1 {v16.8b}, [x15], %[input_depth]\n" + "smlal v24.4s, v8.4h, v17.4h\n" + "uaddw v18.8h, v28.8h, v18.8b\n" + "smlal2 v25.4s, v8.8h, v17.8h\n" + "ld1 {v17.8b}, [x15], %[input_depth]\n" + "uaddw v19.8h, v28.8h, v19.8b\n" + + "smlal v26.4s, v1.4h, v18.4h\n" + "uaddw v20.8h, v28.8h, v20.8b\n" + "smlal2 v27.4s, v1.8h, v18.8h\n" + "smlal v26.4s, v2.4h, v19.4h\n" + "uaddw v21.8h, v28.8h, v21.8b\n" + "smlal2 v27.4s, v2.8h, v19.8h\n" + "smlal v26.4s, v4.4h, v20.4h\n" + "smlal v26.4s, v5.4h, v21.4h\n" + "smlal2 v27.4s, v4.8h, v20.8h\n" + "uaddw v22.8h, v28.8h, v22.8b\n" + "smlal2 v27.4s, v5.8h, v21.8h\n" + "uaddw v23.8h, v28.8h, v23.8b\n" + "smlal v26.4s, v7.4h, v22.4h\n" + "smlal2 v27.4s, v7.8h, v22.8h\n" + "smlal v26.4s, v8.4h, v23.4h\n" + "smlal2 v27.4s, v8.8h, v23.8h\n" + + "dup v28.4s, w1\n" + "dup v29.4s, w9\n" + "sqrdmulh v24.4s, v24.4s, v28.4s\n" + "sqrdmulh v25.4s, v25.4s, v28.4s\n" + "sqrdmulh v26.4s, v26.4s, v28.4s\n" + "sqrdmulh 
v27.4s, v27.4s, v28.4s\n" + "dup v28.8h, w2\n" + "sqrshl v24.4s, v24.4s, v29.4s\n" + "sqrshl v25.4s, v25.4s, v29.4s\n" + "sqrshl v26.4s, v26.4s, v29.4s\n" + "sqrshl v27.4s, v27.4s, v29.4s\n" + "sqxtn v24.4h, v24.4s\n" + "sqxtn2 v24.8h, v25.4s\n" + "sqxtn v26.4h, v26.4s\n" + "sqxtn2 v26.8h, v27.4s\n" + "sqadd v24.8h, v24.8h, v28.8h\n" + "sqadd v26.8h, v26.8h, v28.8h\n" + "sqxtun v24.8b, v24.8h\n" + "sqxtun2 v24.16b, v26.8h\n" + "dup v28.8h, w0\n" + "ld1 {v25.4s}, [x10]\n" + "umax v24.16b, v24.16b, v30.16b\n" + "umin v24.16b, v24.16b, v31.16b\n" + "ld1 {v27.4s}, [x10]\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "st1 {v24.8b}, [x6], x5\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "mov v26.d[0], v24.d[1]\n" + "st1 {v26.8b}, [x6], x5\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "uaddw v14.8h, v28.8h, v14.8b\n" + "ld1 {v24.4s}, [%[bias_ptr]]\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "ld1 {v26.4s}, [%[bias_ptr]]\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + + "bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n" + + // At this point, there will be one of 2 width or 1 width leftover, + // not both. + "cmp w14, #2\n" + "blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n" + + // Handle last two horizontal outputs if exists. + DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n" + "smlal v24.4s, v0.4h, v9.4h\n" + "ld1 {v18.8b}, [x12], %[input_depth]\n" + "smlal2 v25.4s, v0.8h, v9.8h\n" + "ld1 {v19.8b}, [x12]\n" + "smlal v26.4s, v0.4h, v11.4h\n" + "ld1 {v20.8b}, [x13], %[input_depth]\n" + "smlal2 v27.4s, v0.8h, v11.8h\n" + "ld1 {v21.8b}, [x13]\n" + "smlal v24.4s, v1.4h, v10.4h\n" + "ld1 {v22.8b}, [x15], %[input_depth]\n" + "smlal2 v25.4s, v1.8h, v10.8h\n" + "ld1 {v23.8b}, [x15]\n" + "smlal v24.4s, v2.4h, v11.4h\n" + "smlal2 v25.4s, v2.8h, v11.8h\n" + "smlal v24.4s, v3.4h, v12.4h\n" + "smlal2 v25.4s, v3.8h, v12.8h\n" + "smlal v26.4s, v3.4h, v14.4h\n" + "smlal2 v27.4s, v3.8h, v14.8h\n" + "smlal v24.4s, v4.4h, v13.4h\n" + "smlal2 v25.4s, v4.8h, v13.8h\n" + "smlal v24.4s, v5.4h, v14.4h\n" + "smlal2 v25.4s, v5.8h, v14.8h\n" + "smlal v24.4s, v6.4h, v15.4h\n" + "smlal2 v25.4s, v6.8h, v15.8h\n" + "smlal v26.4s, v6.4h, v17.4h\n" + "smlal2 v27.4s, v6.8h, v17.8h\n" + "smlal v24.4s, v7.4h, v16.4h\n" + "smlal2 v25.4s, v7.8h, v16.8h\n" + "smlal v24.4s, v8.4h, v17.4h\n" + "uaddw v18.8h, v28.8h, v18.8b\n" + "smlal2 v25.4s, v8.8h, v17.8h\n" + "uaddw v19.8h, v28.8h, v19.8b\n" + + "smlal v26.4s, v1.4h, v18.4h\n" + "uaddw v20.8h, v28.8h, v20.8b\n" + "smlal2 v27.4s, v1.8h, v18.8h\n" + "smlal v26.4s, v2.4h, v19.4h\n" + "uaddw v21.8h, v28.8h, v21.8b\n" + "smlal2 v27.4s, v2.8h, v19.8h\n" + "smlal v26.4s, v4.4h, v20.4h\n" + "smlal v26.4s, v5.4h, v21.4h\n" + "smlal2 v27.4s, v4.8h, v20.8h\n" + "uaddw v22.8h, v28.8h, v22.8b\n" + "smlal2 v27.4s, v5.8h, v21.8h\n" + "uaddw v23.8h, v28.8h, v23.8b\n" + "smlal v26.4s, v7.4h, v22.4h\n" + "smlal2 v27.4s, v7.8h, v22.8h\n" + "smlal v26.4s, v8.4h, v23.4h\n" + "smlal2 v27.4s, v8.8h, v23.8h\n" + + "dup v28.4s, w1\n" + "dup v29.4s, w9\n" + "sqrdmulh v24.4s, v24.4s, v28.4s\n" + "sqrdmulh v25.4s, v25.4s, v28.4s\n" + "sqrdmulh v26.4s, v26.4s, v28.4s\n" + "sqrdmulh v27.4s, v27.4s, v28.4s\n" + "dup v28.8h, w2\n" + "sqrshl v24.4s, v24.4s, v29.4s\n" + "sqrshl v25.4s, v25.4s, v29.4s\n" + "sqrshl v26.4s, v26.4s, v29.4s\n" + "sqrshl v27.4s, v27.4s, v29.4s\n" + "sqxtn v24.4h, v24.4s\n" + "sqxtn2 v24.8h, v25.4s\n" + "sqxtn v26.4h, v26.4s\n" + "sqxtn2 v26.8h, v27.4s\n" + "sqadd v24.8h, v24.8h, v28.8h\n" + "sqadd v26.8h, 
v26.8h, v28.8h\n" + "sqxtun v24.8b, v24.8h\n" + "sqxtun2 v24.16b, v26.8h\n" + "dup v28.8h, w0\n" + "umax v24.16b, v24.16b, v30.16b\n" + "umin v24.16b, v24.16b, v31.16b\n" + "st1 {v24.8b}, [x6], x5\n" + "mov v26.d[0], v24.d[1]\n" + "st1 {v26.8b}, [x6]\n" + "b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n" + + // Handle bottom right output if exists. + DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n" + "dup v26.4s, w9\n" + "dup v27.4s, w1\n" + "dup v29.8h, w2\n" + + "smlal v24.4s, v0.4h, v9.4h\n" + "smlal2 v25.4s, v0.8h, v9.8h\n" + "smlal v24.4s, v1.4h, v10.4h\n" + "smlal2 v25.4s, v1.8h, v10.8h\n" + "smlal v24.4s, v2.4h, v11.4h\n" + "smlal2 v25.4s, v2.8h, v11.8h\n" + "smlal v24.4s, v3.4h, v12.4h\n" + "smlal2 v25.4s, v3.8h, v12.8h\n" + "smlal v24.4s, v4.4h, v13.4h\n" + "smlal2 v25.4s, v4.8h, v13.8h\n" + "smlal v24.4s, v5.4h, v14.4h\n" + "smlal2 v25.4s, v5.8h, v14.8h\n" + "smlal v24.4s, v6.4h, v15.4h\n" + "smlal2 v25.4s, v6.8h, v15.8h\n" + "smlal v24.4s, v7.4h, v16.4h\n" + "smlal2 v25.4s, v7.8h, v16.8h\n" + "smlal v24.4s, v8.4h, v17.4h\n" + "smlal2 v25.4s, v8.8h, v17.8h\n" + + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "sqrdmulh v25.4s, v25.4s, v27.4s\n" + "sqrshl v24.4s, v24.4s, v26.4s\n" + "sqrshl v25.4s, v25.4s, v26.4s\n" + "sqxtn v24.4h, v24.4s\n" + "sqxtn2 v24.8h, v25.4s\n" + "sqadd v24.8h, v24.8h, v29.8h\n" + "sqxtun v24.8b, v24.8h\n" + "umax v24.8b, v24.8b, v30.8b\n" + "umin v24.8b, v24.8b, v31.8b\n" + "st1 {v24.8b}, [x6]\n" + + DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n" + : + // Outputs. + [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr), + [output_ptr] "+r"(output_ptr), + [output_window_height] "+r"(output_window_height) + : + // Inputs. + [bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size), + [input_depth] "r"(input_depth), + [output_window_width] "r"(output_window_width), + [input_width_increment] "r"(input_width_increment), + [input_height_increment] "r"(input_height_increment), + [output_height_increment] "r"(output_height_increment), + [params_ptr] "r"(params_ptr) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", + // We use these general-purpose registers. + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x19", "x20"); +#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_1 +#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_1_END + } +}; + +template <> +struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward, + EdgeType::kCenter, 1, 1> { + static inline void Run(const uint8* input_ptr, const uint8* filter_ptr, + const int32* bias_ptr, uint8* output_ptr, + const DepthwiseConvParams* params_ptr) { +#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1" +#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2" + asm volatile( + // Performs depthwise convolutions for an input window of size 1x1 and + // padding of 1 across the full depth. 
Expects |input_ptr| and + // |filter_ptr| to be pointing to the 1x1 input and filter values. + "ld1 {v8.8b}, [%[input_ptr]], #8\n" + "ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n" + "ldr x11, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n" + "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n" + "dup v26.8h, w9\n" + "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n" + "dup v27.4s, w10\n" + "ld1 {v0.8b}, [%[filter_ptr]], #8\n" + "cmp x11, #16\n" + "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n" + "dup v28.8h, w9\n" + "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n" + "dup v29.4s, w10\n" + "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n" + "dup v30.16b, w9\n" + "ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n" + "dup v31.16b, w10\n" + "dup v25.8h, w9\n" + + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v8.8h, v26.8h, v8.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v0.8h, v25.8h, v0.8b\n" + + "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n" + "smlal v16.4s, v0.4h, v8.4h\n" + "subs x11, x11, #8\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "ld1 {v8.8b}, [%[input_ptr]], #8\n" + "cmp x11, #16\n" + "ld1 {v0.8b}, [%[filter_ptr]], #8\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "sqrshl v16.4s, v16.4s, v29.4s\n" + "sqrshl v17.4s, v17.4s, v29.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqadd v16.8h, v16.8h, v28.8h\n" + "sqxtun v16.8b, v16.8h\n" + "umax v16.8b, v16.8b, v30.8b\n" + "umin v16.8b, v16.8b, v31.8b\n" + "st1 {v16.8b}, [%[output_ptr]], #8\n" + "uaddw v8.8h, v26.8h, v8.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v0.8h, v25.8h, v0.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + + "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n" + + DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n" + "smlal v16.4s, v0.4h, v8.4h\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "sqrshl v16.4s, v16.4s, v29.4s\n" + "sqrshl v17.4s, v17.4s, v29.4s\n" + + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqadd v16.8h, v16.8h, v28.8h\n" + "sqxtun v16.8b, v16.8h\n" + "umax v16.8b, v16.8b, v30.8b\n" + "umin v16.8b, v16.8b, v31.8b\n" + "st1 {v16.8b}, [%[output_ptr]]\n" + : + // Outputs. + [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr), + [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr) + : + // Inputs. + [params_ptr] "r"(params_ptr) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v8", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", + // We use these general-purpose registers. + "x9", "x10", "x11"); +#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP +#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP + } +}; + +template <> +struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward, + EdgeType::kCorner, 1, 1> { + static inline void Run(const uint8* input_ptr, const uint8* filter_ptr, + const int32* bias_ptr, uint8* output_ptr, + const DepthwiseConvParams* params_ptr) { +#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1" +#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2" + asm volatile( + // Performs depthwise convolutions for an input window of size 2x2 and + // padding of 1 across the full depth. Expects |input_ptr| and + // |filter_ptr| to be pointing to the beginning of the 2x2 input and + // filter values. + + // Load input and filter values. 
+ "ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n" + "ldr x9, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n" + "cmp x15, #16\n" + "add x12, %[input_ptr], x15\n" + "add x13, %[input_ptr], x9\n" + "ld1 {v8.8b}, [%[input_ptr]], #8\n" + "add x14, x13, x15\n" + "ld1 {v9.8b}, [x12], #8\n" + "ldr x6, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n" + + "add x9, %[filter_ptr], x15\n" + "ld1 {v10.8b}, [x13], #8\n" + "add x10, %[filter_ptr], x6\n" + "ld1 {v11.8b}, [x14], #8\n" + "ld1 {v0.8b}, [%[filter_ptr]], #8\n" + "add x11, x10, x15\n" + "ld1 {v1.8b}, [x9], #8\n" + "ld1 {v2.8b}, [x10], #8\n" + "ld1 {v3.8b}, [x11], #8\n" + + // Load constants. + "ldr w6, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n" + "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n" + "dup v26.8h, w6\n" + "ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n" + "dup v27.4s, w7\n" + "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n" + "dup v28.8h, w6\n" + "ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n" + "dup v29.4s, w7\n" + "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n" + "dup v30.16b, w6\n" + "ldr w6, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n" + "dup v31.16b, w7\n" + "dup v25.8h, w6\n" + + // Add input and filter offsets. + "uaddw v8.8h, v26.8h, v8.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + + "uaddw v0.8h, v25.8h, v0.8b\n" + "uaddw v1.8h, v25.8h, v1.8b\n" + "uaddw v2.8h, v25.8h, v2.8b\n" + "uaddw v3.8h, v25.8h, v3.8b\n" + + "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n" + "smlal v16.4s, v0.4h, v8.4h\n" + "subs x15, x15, #8\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "ld1 {v8.8b}, [%[input_ptr]], #8\n" + "cmp x15, #16\n" + "ld1 {v0.8b}, [%[filter_ptr]], #8\n" + "smlal v16.4s, v1.4h, v9.4h\n" + "smlal2 v17.4s, v1.8h, v9.8h\n" + "ld1 {v9.8b}, [x12], #8\n" + "smlal v16.4s, v2.4h, v10.4h\n" + "ld1 {v1.8b}, [x9], #8\n" + "smlal2 v17.4s, v2.8h, v10.8h\n" + "ld1 {v10.8b}, [x13], #8\n" + "smlal v16.4s, v3.4h, v11.4h\n" + "ld1 {v2.8b}, [x10], #8\n" + "smlal2 v17.4s, v3.8h, v11.8h\n" + "ld1 {v11.8b}, [x14], #8\n" + "ld1 {v3.8b}, [x11], #8\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "sqrshl v16.4s, v16.4s, v29.4s\n" + "sqrshl v17.4s, v17.4s, v29.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqadd v16.8h, v16.8h, v28.8h\n" + "sqxtun v16.8b, v16.8h\n" + "umax v16.8b, v16.8b, v30.8b\n" + "umin v16.8b, v16.8b, v31.8b\n" + "st1 {v16.8b}, [%[output_ptr]], #8\n" + "uaddw v8.8h, v26.8h, v8.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v0.8h, v25.8h, v0.8b\n" + "uaddw v1.8h, v25.8h, v1.8b\n" + "uaddw v2.8h, v25.8h, v2.8b\n" + "uaddw v3.8h, v25.8h, v3.8b\n" + + "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n" + + DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n" + "smlal v16.4s, v0.4h, v8.4h\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "smlal v16.4s, v1.4h, v9.4h\n" + "smlal2 v17.4s, v1.8h, v9.8h\n" + "smlal v16.4s, v2.4h, v10.4h\n" + "smlal2 v17.4s, v2.8h, v10.8h\n" + "smlal v16.4s, v3.4h, v11.4h\n" + "smlal2 v17.4s, v3.8h, v11.8h\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "sqrshl v16.4s, v16.4s, v29.4s\n" + 
"sqrshl v17.4s, v17.4s, v29.4s\n" + + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqadd v16.8h, v16.8h, v28.8h\n" + "sqxtun v16.8b, v16.8h\n" + "umax v16.8b, v16.8b, v30.8b\n" + "umin v16.8b, v16.8b, v31.8b\n" + "st1 {v16.8b}, [%[output_ptr]]\n" + : + // Outputs. + [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr), + [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr) + : + // Inputs. + [params_ptr] "r"(params_ptr) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v16", "v17", "v18", + "v19", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. + "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15"); +#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP +#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP + } +}; + +template <> +struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward, + EdgeType::kHorizontal, 1, 1> { + static inline void Run(const uint8* input_ptr, const uint8* filter_ptr, + const int32* bias_ptr, uint8* output_ptr, + const DepthwiseConvParams* params_ptr) { +#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1" +#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2" + asm volatile( + // Performs depthwise convolutions for an input window of size 2x3 and + // padding of 1 across the full depth. Expects |input_ptr| and + // |filter_ptr| to be pointing to the beginning of the 2x3 input and + // filter values. + + // Load input and filter values. + "ldr x7, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n" + "mov x12, %[input_ptr]\n" + "ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n" + "mov x9, %[filter_ptr]\n" + "ldr x14, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n" + "add x13, x12, x11\n" + "ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n" + + "ld1 {v8.8b}, [x12], x7\n" + "add x10, x9, x14\n" + "ld1 {v9.8b}, [x12], x7\n" + "cmp x15, #16\n" + "ld1 {v10.8b}, [x12]\n" + "add %[input_ptr], %[input_ptr], #8\n" + "ld1 {v11.8b}, [x13], x7\n" + "add %[filter_ptr], %[filter_ptr], #8\n" + "ld1 {v12.8b}, [x13], x7\n" + "ld1 {v13.8b}, [x13]\n" + + "ld1 {v0.8b}, [x9], x7\n" + "ld1 {v1.8b}, [x9], x7\n" + "ld1 {v2.8b}, [x9]\n" + "ld1 {v3.8b}, [x10], x7\n" + "ld1 {v4.8b}, [x10], x7\n" + "ld1 {v5.8b}, [x10]\n" + + // Load constants. + "ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n" + "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n" + "dup v26.8h, w12\n" + "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n" + "dup v27.4s, w13\n" + "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n" + "dup v28.8h, w12\n" + "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n" + "dup v29.4s, w13\n" + "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n" + "dup v30.8b, w12\n" + "ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n" + "dup v31.8b, w13\n" + "dup v25.8h, w12\n" + + // Add input and filter offsets. 
+ "uaddw v8.8h, v26.8h, v8.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + + "uaddw v0.8h, v25.8h, v0.8b\n" + "uaddw v1.8h, v25.8h, v1.8b\n" + "uaddw v2.8h, v25.8h, v2.8b\n" + "uaddw v3.8h, v25.8h, v3.8b\n" + "uaddw v4.8h, v25.8h, v4.8b\n" + "uaddw v5.8h, v25.8h, v5.8b\n" + + "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n" + "mov x12, %[input_ptr]\n" + "subs x15, x15, #8\n" + "add x13, x12, x11\n" + "cmp x15, #16\n" + "add %[input_ptr], %[input_ptr], #8\n" + + "smlal v16.4s, v0.4h, v8.4h\n" + "mov x9, %[filter_ptr]\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "ld1 {v8.8b}, [x12], x7\n" + "smlal v16.4s, v1.4h, v9.4h\n" + "add x10, x9, x14\n" + "smlal2 v17.4s, v1.8h, v9.8h\n" + "ld1 {v9.8b}, [x12], x7\n" + "smlal v16.4s, v2.4h, v10.4h\n" + "add %[filter_ptr], %[filter_ptr], #8\n" + "smlal2 v17.4s, v2.8h, v10.8h\n" + "ld1 {v10.8b}, [x12]\n" + "smlal v16.4s, v3.4h, v11.4h\n" + "ld1 {v0.8b}, [x9], x7\n" + "smlal2 v17.4s, v3.8h, v11.8h\n" + "ld1 {v11.8b}, [x13], x7\n" + "smlal v16.4s, v4.4h, v12.4h\n" + "ld1 {v1.8b}, [x9], x7\n" + "smlal2 v17.4s, v4.8h, v12.8h\n" + "ld1 {v12.8b}, [x13], x7\n" + "smlal v16.4s, v5.4h, v13.4h\n" + "ld1 {v2.8b}, [x9]\n" + "smlal2 v17.4s, v5.8h, v13.8h\n" + "ld1 {v13.8b}, [x13]\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "ld1 {v3.8b}, [x10], x7\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "ld1 {v4.8b}, [x10], x7\n" + "sqrshl v16.4s, v16.4s, v29.4s\n" + "ld1 {v5.8b}, [x10]\n" + "sqrshl v17.4s, v17.4s, v29.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqadd v16.8h, v16.8h, v28.8h\n" + "sqxtun v16.8b, v16.8h\n" + "umax v16.8b, v16.8b, v30.8b\n" + "umin v16.8b, v16.8b, v31.8b\n" + "uaddw v8.8h, v26.8h, v8.8b\n" + "st1 {v16.8b}, [%[output_ptr]], #8\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + + "uaddw v0.8h, v25.8h, v0.8b\n" + "uaddw v1.8h, v25.8h, v1.8b\n" + "uaddw v2.8h, v25.8h, v2.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v3.8h, v25.8h, v3.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v4.8h, v25.8h, v4.8b\n" + "uaddw v5.8h, v25.8h, v5.8b\n" + + "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n" + + DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n" + "smlal v16.4s, v0.4h, v8.4h\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "smlal v16.4s, v1.4h, v9.4h\n" + "smlal2 v17.4s, v1.8h, v9.8h\n" + "smlal v16.4s, v2.4h, v10.4h\n" + "smlal2 v17.4s, v2.8h, v10.8h\n" + "smlal v16.4s, v3.4h, v11.4h\n" + "smlal2 v17.4s, v3.8h, v11.8h\n" + "smlal v16.4s, v4.4h, v12.4h\n" + "smlal2 v17.4s, v4.8h, v12.8h\n" + "smlal v16.4s, v5.4h, v13.4h\n" + "smlal2 v17.4s, v5.8h, v13.8h\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "sqrshl v16.4s, v16.4s, v29.4s\n" + "sqrshl v17.4s, v17.4s, v29.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqadd v16.8h, v16.8h, v28.8h\n" + "sqxtun v16.8b, v16.8h\n" + "umax v16.8b, v16.8b, v30.8b\n" + "umin v16.8b, v16.8b, v31.8b\n" + "st1 {v16.8b}, [%[output_ptr]]\n" + : + // Outputs. + [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr), + [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr) + : + // Inputs. + [params_ptr] "r"(params_ptr) + : + // Clobbers. 
+ "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", + "v13", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", + // We use these general-purpose registers. + "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15"); +#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP +#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP + } +}; +template <> +struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward, + EdgeType::kVertical, 1, 1> { + static inline void Run(const uint8* input_ptr, const uint8* filter_ptr, + const int32* bias_ptr, uint8* output_ptr, + const DepthwiseConvParams* params_ptr) { +#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1" +#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2" + asm volatile( + // Performs depthwise convolutions for an input window of size 3x2 and + // padding of 1 across the full depth. Expects |input_ptr| and + // |filter_ptr| to be pointing to the beginning of the 3x2 input and + // filter values. + + // Load input and filter values. + "ldr x6, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n" + "mov x12, %[input_ptr]\n" + "ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n" + "mov x7, %[filter_ptr]\n" + "ldr x5, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n" + "add x13, x12, x11\n" + "ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n" + "add x14, x13, x11\n" + + "ld1 {v8.8b}, [x12], x6\n" + "add x9, x7, x5\n" + "ld1 {v9.8b}, [x12]\n" + "cmp x15, #16\n" + "add x10, x9, x5\n" + "ld1 {v10.8b}, [x13], x6\n" + "add %[input_ptr], %[input_ptr], #8\n" + "ld1 {v11.8b}, [x13]\n" + "add %[filter_ptr], %[filter_ptr], #8\n" + "ld1 {v12.8b}, [x14], x6\n" + "ld1 {v13.8b}, [x14]\n" + + "ld1 {v0.8b}, [x7], x6\n" + "ld1 {v1.8b}, [x7]\n" + "ld1 {v2.8b}, [x9], x6\n" + "ld1 {v3.8b}, [x9]\n" + "ld1 {v4.8b}, [x10], x6\n" + "ld1 {v5.8b}, [x10]\n" + + // Load constants. + "ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n" + "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n" + "dup v26.8h, w12\n" + "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n" + "dup v27.4s, w13\n" + "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n" + "dup v28.8h, w12\n" + "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n" + "dup v29.4s, w13\n" + "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n" + "dup v30.8b, w12\n" + "ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n" + "dup v31.8b, w13\n" + "dup v25.8h, w12\n" + + // Add input and filter offsets. 
+ "uaddw v8.8h, v26.8h, v8.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + + "uaddw v0.8h, v25.8h, v0.8b\n" + "uaddw v1.8h, v25.8h, v1.8b\n" + "uaddw v2.8h, v25.8h, v2.8b\n" + "uaddw v3.8h, v25.8h, v3.8b\n" + "uaddw v4.8h, v25.8h, v4.8b\n" + "uaddw v5.8h, v25.8h, v5.8b\n" + + "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n" + "mov x12, %[input_ptr]\n" + "subs x15, x15, #8\n" + "add x13, x12, x11\n" + "cmp x15, #16\n" + "add x14, x13, x11\n" + "add %[input_ptr], %[input_ptr], #8\n" + + "smlal v16.4s, v0.4h, v8.4h\n" + "mov x7, %[filter_ptr]\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "ld1 {v8.8b}, [x12], x6\n" + "smlal v16.4s, v1.4h, v9.4h\n" + "add x9, x7, x5\n" + "smlal2 v17.4s, v1.8h, v9.8h\n" + "add x10, x9, x5\n" + "ld1 {v9.8b}, [x12]\n" + "smlal v16.4s, v2.4h, v10.4h\n" + "add %[filter_ptr], %[filter_ptr], #8\n" + "smlal2 v17.4s, v2.8h, v10.8h\n" + "ld1 {v10.8b}, [x13], x6\n" + "smlal v16.4s, v3.4h, v11.4h\n" + "ld1 {v0.8b}, [x7], x6\n" + "smlal2 v17.4s, v3.8h, v11.8h\n" + "ld1 {v11.8b}, [x13]\n" + "smlal v16.4s, v4.4h, v12.4h\n" + "ld1 {v1.8b}, [x7]\n" + "smlal2 v17.4s, v4.8h, v12.8h\n" + "ld1 {v12.8b}, [x14], x6\n" + "smlal v16.4s, v5.4h, v13.4h\n" + "ld1 {v2.8b}, [x9], x6\n" + "smlal2 v17.4s, v5.8h, v13.8h\n" + "ld1 {v13.8b}, [x14]\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "ld1 {v3.8b}, [x9]\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "ld1 {v4.8b}, [x10], x6\n" + "sqrshl v16.4s, v16.4s, v29.4s\n" + "ld1 {v5.8b}, [x10]\n" + "sqrshl v17.4s, v17.4s, v29.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqadd v16.8h, v16.8h, v28.8h\n" + "sqxtun v16.8b, v16.8h\n" + "umax v16.8b, v16.8b, v30.8b\n" + "umin v16.8b, v16.8b, v31.8b\n" + "uaddw v8.8h, v26.8h, v8.8b\n" + "st1 {v16.8b}, [%[output_ptr]], #8\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + + "uaddw v0.8h, v25.8h, v0.8b\n" + "uaddw v1.8h, v25.8h, v1.8b\n" + "uaddw v2.8h, v25.8h, v2.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v3.8h, v25.8h, v3.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v4.8h, v25.8h, v4.8b\n" + "uaddw v5.8h, v25.8h, v5.8b\n" + + "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n" + + DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n" + "smlal v16.4s, v0.4h, v8.4h\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "smlal v16.4s, v1.4h, v9.4h\n" + "smlal2 v17.4s, v1.8h, v9.8h\n" + "smlal v16.4s, v2.4h, v10.4h\n" + "smlal2 v17.4s, v2.8h, v10.8h\n" + "smlal v16.4s, v3.4h, v11.4h\n" + "smlal2 v17.4s, v3.8h, v11.8h\n" + "smlal v16.4s, v4.4h, v12.4h\n" + "smlal2 v17.4s, v4.8h, v12.8h\n" + "smlal v16.4s, v5.4h, v13.4h\n" + "smlal2 v17.4s, v5.8h, v13.8h\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "sqrshl v16.4s, v16.4s, v29.4s\n" + "sqrshl v17.4s, v17.4s, v29.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqadd v16.8h, v16.8h, v28.8h\n" + "sqxtun v16.8b, v16.8h\n" + // TODO(b/129852264): Improve testing coverage. + "umax v16.8b, v16.8b, v30.8b\n" + "umin v16.8b, v16.8b, v31.8b\n" + "st1 {v16.8b}, [%[output_ptr]]\n" + : + // Outputs. 
+ [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr), + [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr) + : + // Inputs. + [params_ptr] "r"(params_ptr) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", + "v13", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", + // We use these general-purpose registers. + "x5", "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15"); +#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP +#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP + } +}; + +#undef OFFSET_INPUT_DEPTH +#undef OFFSET_INPUT_ROW_SIZE +#undef OFFSET_OUTPUT_DEPTH +#undef OFFSET_OUTPUT_ROW_SIZE +#undef OFFSET_INPUT_OFFSET +#undef OFFSET_OUTPUT_OFFSET +#undef OFFSET_FILTER_OFFSET +#undef OFFSET_OUTPUT_MULTIPLIER +#undef OFFSET_OUTPUT_ACTIVATION_MIN +#undef OFFSET_OUTPUT_ACTIVATION_MAX +#undef OFFSET_OUTPUT_RIGHT_SHIFT +#undef OFFSET_INPUT_WIDTH +#undef OFFSET_INPUT_HEIGHT +#undef OFFSET_OUTPUT_WIDTH +#undef OFFSET_OUTPUT_HEIGHT + +template <DepthwiseConvOutputRounding output_rounding, int32 kStrideWidth, + int32 kStrideHeight> +struct DepthwiseConvThroughDepthPerChannel { + // Runs the DepthwiseConvWindowPerChannel kernels through the depth dimension + // from |start_depth| to |end_depth|. Keep this not inlined to maintain a + // small binary size. We use a DepthwiseConvParams struct for read only params + // to minimize call overhead. + static void __attribute__((noinline)) + Run(const uint8* input_ptr, const uint8* filter_ptr, const int32* bias_ptr, + uint8* output_ptr, int64_t start_depth, int64_t end_depth, + int64_t input_depth, int64_t input_row_size, int32 output_window_height, + int32 output_window_width, const DepthwiseConvParams& params) { + for (; start_depth <= end_depth - 8; start_depth += 8) { + DepthwiseConvWindowPerChannel<output_rounding, 8, kStrideWidth, + kStrideHeight>::Run(input_ptr, filter_ptr, + bias_ptr, output_ptr, + input_depth, + input_row_size, + output_window_height, + output_window_width, + ¶ms); + input_ptr += 8; + output_ptr += 8; + filter_ptr += 8; + bias_ptr += 8; + } + } +}; + +template <DepthwiseConvOutputRounding output_rounding, int32 kStrideWidth, + int32 kStrideHeight> +struct DepthwiseConvMultiRowPerChannel { + using ConvKernel = + DepthwiseConvThroughDepthPerChannel<output_rounding, kStrideWidth, + kStrideHeight>; + + static inline void Run(const uint8* input_data, int32 start_x, int32 end_x, + const uint8* filter_data, const int32* bias_data, + uint8* output_data, const DepthwiseConvParams& params, + const ShuffleParams& shuffle_params, + uint8* shuffle_workspace) { + TFLITE_DCHECK( + shuffle_params.input_height == + get_shuffle_input_size(kStrideHeight, shuffle_params.output_height)); + TFLITE_DCHECK( + shuffle_params.input_width == + get_shuffle_input_size(kStrideWidth, shuffle_params.output_width)); + TFLITE_DCHECK_LE( + 64 * shuffle_params.input_width * shuffle_params.input_height, + kDepthwiseConvScratchWorkspaceSize); + + int32 out_x = start_x; + + // Run shuffling on inputs with sufficiently large depth and width. When + // these parameters are large enough, more time is taken to load inputs + // from memory. At this point, it becomes useful to prefetch and + // preshuffle the input data to maximize locality. 
+ if (params.output_depth > 64 || + (params.output_depth <= 64 && params.input_width > 150)) { + for (; out_x <= (end_x - shuffle_params.output_width); + out_x += shuffle_params.output_width) { + const uint8* input_ptr = input_data; + const int32* bias_ptr = bias_data; + const uint8* filter_ptr = filter_data; + uint8* output_ptr = output_data; + int64_t depth = 0; + const int64_t shuffle_row_size = 64 * shuffle_params.input_width; + + for (; depth <= params.output_depth - 64; depth += 64) { + // Preload. + const uint8* h_ptr = input_ptr; + for (int32 i = 0; i < shuffle_params.input_height; i++) { + const uint8* ptr = h_ptr; + for (int32 j = 0; j < shuffle_params.input_width; j++) { + asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :); + ptr += params.input_depth; + } + h_ptr += params.input_row_size; + } + + // For a large enough input, shuffle into buckets. + ShuffleInput(input_ptr, params.input_depth, params.input_width, + params.input_height, 64, shuffle_params.input_width, + shuffle_params.input_height, shuffle_workspace); + ConvKernel::Run(shuffle_workspace, filter_ptr, bias_ptr, output_ptr, + 0, 64, 64, shuffle_row_size, + shuffle_params.output_height, + shuffle_params.output_width, params); + input_ptr += 64; + output_ptr += 64; + filter_ptr += 64; + bias_ptr += 64; + } + + // Preload. + const uint8* h_ptr = input_ptr; + for (int32 i = 0; i < shuffle_params.input_height; i++) { + const uint8* ptr = h_ptr; + for (int32 j = 0; j < shuffle_params.input_width; j++) { + asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :); + ptr += params.input_depth; + } + h_ptr += params.input_row_size; + } + + // Handle leftover depth. + ConvKernel::Run(input_ptr, filter_ptr, bias_ptr, output_ptr, depth, + params.output_depth, params.input_depth, + params.input_row_size, shuffle_params.output_height, + shuffle_params.output_width, params); + + input_data += + shuffle_params.output_width * kStrideWidth * params.input_depth; + output_data += shuffle_params.output_width * params.output_depth; + } + } + + const int32 output_leftover_width = end_x - out_x; + if (output_leftover_width > 0) { + ConvKernel::Run(input_data, filter_data, bias_data, output_data, 0, + params.output_depth, params.input_depth, + params.input_row_size, shuffle_params.output_height, + output_leftover_width, params); + } + } +}; + +// Processes the borders of the input for pad_width and pad_height = 1. +// Calls 4 asm kernels: +// * 1x1 input shape. +// * Corner edges. +// * Horizontal edges. +// * Vertical edges. +template <DepthwiseConvOutputRounding output_rounding> +inline void DepthwiseConvHandlePaddingPerChannel( + const uint8* input_data, const uint8* filter_data, const int32* bias_data, + uint8* output_data, const DepthwiseConvParams& params) { + if (params.input_width == 1 && params.input_height == 1) { + const uint8* filter_ptr = + filter_data + params.filter_row_size + params.output_depth; + DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCenter, 1, + 1>::Run(input_data, filter_ptr, bias_data, + output_data, ¶ms); + return; + } + + const int32 out_x_start_corner = 0; + const int32 out_x_end_corner = params.output_width - 1; + const int32 out_y_start_corner = 0; + const int32 out_y_end_corner = params.output_height - 1; + + // Handle top row. 
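+  // With pad = 1 the padded positions contribute zeros, so each border kernel
+  // is pointed at the sub-block of the 3x3 filter that overlaps real input.
+  // For the top-left corner that is the bottom-right 2x2 of the filter, hence
+  // filter_ptr = filter_data + params.filter_row_size + params.output_depth
+  // (skip one filter row and one filter column); the interior of the top row
+  // then uses filter_data + params.filter_row_size (skip only the top filter
+  // row), and similarly for the other edges handled below.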
+ const uint8* input_ptr = input_data; + const uint8* filter_ptr = + filter_data + params.filter_row_size + params.output_depth; + uint8* output_ptr = output_data; + + DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCorner, 1, 1>::Run( + input_ptr, filter_ptr, bias_data, output_ptr, ¶ms); + + input_ptr += (params.stride_width - 1) * params.input_depth; + filter_ptr = filter_data + params.filter_row_size; + output_ptr += params.output_depth; + + for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner; + out_x++) { + DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kHorizontal, 1, + 1>::Run(input_ptr, filter_ptr, bias_data, + output_ptr, ¶ms); + input_ptr += params.stride_width * params.input_depth; + output_ptr += params.output_depth; + } + + DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCorner, 1, 1>::Run( + input_ptr, filter_ptr, bias_data, output_ptr, ¶ms); + + // Handle left side. + input_ptr = input_data + (params.stride_width - 1) * params.input_row_size; + filter_ptr = filter_data + params.input_depth; + output_ptr = output_data + params.output_row_size; + + for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner; + out_y++) { + DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kVertical, 1, + 1>::Run(input_ptr, filter_ptr, bias_data, + output_ptr, ¶ms); + input_ptr += params.stride_width * params.input_row_size; + output_ptr += params.output_row_size; + } + + // Handle right side. + input_ptr = input_data + (params.input_width - 2) * params.input_depth + + (params.stride_width - 1) * params.input_row_size; + filter_ptr = filter_data; + output_ptr = output_data + params.output_row_size + + (params.output_width - 1) * params.output_depth; + + for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner; + out_y++) { + DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kVertical, 1, + 1>::Run(input_ptr, filter_ptr, bias_data, + output_ptr, ¶ms); + input_ptr += params.stride_width * params.input_row_size; + output_ptr += params.output_row_size; + } + + // Handle bottom row. + input_ptr = input_data + (params.input_height - 2) * params.input_row_size; + filter_ptr = filter_data + params.output_depth; + output_ptr = + output_data + (params.output_height - 1) * params.output_row_size; + + DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCorner, 1, 1>::Run( + input_ptr, filter_ptr, bias_data, output_ptr, ¶ms); + + input_ptr += (params.stride_width == 1) ? 
0 : params.input_depth; + filter_ptr = filter_data; + output_ptr += params.output_depth; + + for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner; + out_x++) { + DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kHorizontal, 1, + 1>::Run(input_ptr, filter_ptr, bias_data, + output_ptr, ¶ms); + input_ptr += params.stride_width * params.input_depth; + output_ptr += params.output_depth; + } + + DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCorner, 1, 1>::Run( + input_ptr, filter_ptr, bias_data, output_ptr, ¶ms); +} + +template <DepthwiseConvOutputRounding output_rounding> +inline void DepthwiseConv3x3FilterPerChannel( + const DepthwiseParams& rt_params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& filter_shape, + const uint8* filter_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, + uint8* output_data, int thread_start, int thread_end, int thread_dim) { + DepthwiseConvParams params; + + const int32 stride_width = rt_params.stride_width; + const int32 stride_height = rt_params.stride_height; + const int32 pad_width = rt_params.padding_values.width; + const int32 pad_height = rt_params.padding_values.height; + const int32 depth_multiplier = rt_params.depth_multiplier; + const int32 output_activation_min = rt_params.quantized_activation_min; + const int32 output_activation_max = rt_params.quantized_activation_max; + const int32 input_offset = rt_params.input_offset; + const int32 filter_offset = rt_params.weights_offset; + const int32 output_offset = rt_params.output_offset; + const int32 output_multiplier = rt_params.output_multiplier; + const int32 output_shift = rt_params.output_shift; + + params.input_depth = input_shape.Dims(3); + params.input_width = input_shape.Dims(2); + params.input_height = input_shape.Dims(1); + params.input_row_size = params.input_depth * params.input_width; + params.input_offset = input_offset; + params.stride_width = stride_width; + params.stride_height = stride_height; + params.output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + params.output_width = output_shape.Dims(2); + params.output_height = output_shape.Dims(1); + params.output_row_size = params.output_depth * params.output_width; + params.output_offset = output_offset; + params.filter_offset = filter_offset; + params.output_multiplier = output_multiplier; + params.output_right_shift = output_shift; + params.output_activation_min = output_activation_min; + params.output_activation_max = output_activation_max; + + const int32 filter_height = filter_shape.Dims(1); + const int32 filter_width = filter_shape.Dims(2); + params.filter_row_size = params.output_depth * filter_width; + + // Algorithm assumes below constraints. It is optimized for depth + // multiplier of 1, 3x3 filter, no padding and strides 1 and 2. 
+ TFLITE_DCHECK(params.output_depth == params.input_depth * depth_multiplier); + TFLITE_DCHECK(depth_multiplier == 1); + TFLITE_DCHECK(filter_height == 3); + TFLITE_DCHECK(filter_width == 3); + TFLITE_DCHECK(stride_height == 1 || stride_height == 2); + TFLITE_DCHECK(stride_width == 1 || stride_width == 2); + TFLITE_DCHECK(stride_width == stride_height); + TFLITE_DCHECK(pad_height == 0 || pad_height == 1); + TFLITE_DCHECK(pad_width == 0 || pad_width == 1); + TFLITE_DCHECK(pad_width == pad_height); + TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1); + + const int32 batches = MatchingDim(input_shape, 0, output_shape, 0); + const int64_t input_batch_size = params.input_row_size * params.input_height; + const int64_t output_batch_size = + params.output_row_size * params.output_height; + + ShuffleParams one_row_shuffle_params, two_row_shuffle_params, + four_row_shuffle_params, eight_row_shuffle_params; + if (stride_width == 1) { + one_row_shuffle_params = ShuffleParams(30, 1, 1, 1); + two_row_shuffle_params = ShuffleParams(22, 2, 1, 1); + four_row_shuffle_params = ShuffleParams(14, 4, 1, 1); + eight_row_shuffle_params = ShuffleParams(8, 8, 1, 1); + } else { + one_row_shuffle_params = ShuffleParams(14, 1, 2, 2); + two_row_shuffle_params = ShuffleParams(8, 2, 2, 2); + four_row_shuffle_params = ShuffleParams(4, 4, 2, 2); + eight_row_shuffle_params = ShuffleParams(2, 8, 2, 2); + } + + using conv_multirow_func_t = + decltype(&DepthwiseConvMultiRowPerChannel<output_rounding, 1, 1>::Run); + conv_multirow_func_t conv_multirow_func = + DepthwiseConvMultiRowPerChannel<output_rounding, 1, 1>::Run; + if (stride_width == 2) { + conv_multirow_func = + DepthwiseConvMultiRowPerChannel<output_rounding, 2, 2>::Run; + } + + // Allocate maximum memory needed for shuffled input. + // TODO(mariewhite): The size of this workspace is small enough to be + // allocated on the stack. Eventually we will want to move it to the heap + // and have it allocated outside of this function, like the im2col_array + // used in gemmlowp. + uint8 shuffle_workspace[kDepthwiseConvScratchWorkspaceSize]; + + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = params.output_height; + + switch (thread_dim) { + case 0: + TFLITE_DCHECK_GE(thread_start, 0); + TFLITE_DCHECK_LE(thread_end, batches); + batch_start = thread_start; + batch_end = thread_end; + break; + case 1: + TFLITE_DCHECK_GE(thread_start, 0); + TFLITE_DCHECK_LE(thread_end, params.output_height); + row_start = thread_start; + row_end = thread_end; + break; + } + + for (int32 b = batch_start; b < batch_end; ++b) { + // input_ptr and output_ptr point to the start of each batch + const uint8* input_ptr = input_data + b * input_batch_size; + uint8* output_ptr = output_data + b * output_batch_size; + + int32 out_x = 0; + int32 out_y = row_start; + int32 end_x = params.output_width; + int32 end_y = row_end; + + if (pad_width == 1 && pad_height == 1) { + DepthwiseConvHandlePaddingPerChannel<output_rounding>( + input_ptr, filter_data, bias_data, output_ptr, params); + + // Update extents now that the edges have been handled. + out_x = 1; + end_x = params.output_width - 1; + out_y = std::max(1, out_y); + end_y = std::min(params.output_height - 1, end_y); + } + + // pad_width and pad_height can both be 0 or 1, depending on padding option, + // such as Padding_VALID / Padding_SAME. 
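// Editorial note, not part of the patch: a quick worked example of the
// padding values this path expects. For the 3x3 filter at stride 1,
// Padding_VALID uses pad_width = pad_height = 0 and produces
// output_width = input_width - 2, while Padding_SAME keeps the output the
// same size as the input and pads each side by (3 - 1) / 2 = 1. Either way
// the input coordinate of an output pixel is in_x = out_x * stride - pad,
// which is exactly the computation that follows.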
+ const int in_x = (out_x * stride_width) - pad_width; + const int in_y = (out_y * stride_height) - pad_height; + + // input_ptr and output_ptr point to (in_y, in_x) and (out_y, out_x), + // respectively. (in_y, in_x) and (out_y, out_x) change along with + // row_start. + input_ptr += in_y * params.input_row_size + in_x * params.input_depth; + output_ptr += out_y * params.output_row_size + out_x * params.output_depth; + + // Shuffling shapes that maximize width over the shuffle workspace size + // perform better since the inputs are closer together, minimizing + // shuffling time. + // + // If the input shape has width large enough for the 2 row kernels, + // we prefer to use this. The innermost loop of the kernels handle + // 2 height x 2 width so this is the fastest path. + // + // If the input shape has smaller width but larger height, shuffling is + // still useful and can benefit from kernels 4 row and 8 row kernels. + + // Handle 8 rows at a time. + if (params.input_width < four_row_shuffle_params.input_width) { + for (; out_y <= end_y - 8; out_y += 8) { + conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data, + output_ptr, params, eight_row_shuffle_params, + shuffle_workspace); + input_ptr += 8 * stride_height * params.input_row_size; + output_ptr += 8 * params.output_row_size; + } + } + + // Handle 4 rows at a time. + if (params.input_width < two_row_shuffle_params.input_width) { + for (; out_y <= end_y - 4; out_y += 4) { + conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data, + output_ptr, params, four_row_shuffle_params, + shuffle_workspace); + input_ptr += 4 * stride_height * params.input_row_size; + output_ptr += 4 * params.output_row_size; + } + } + + // Handle 2 rows at a time. + for (; out_y <= end_y - 2; out_y += 2) { + conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data, + output_ptr, params, two_row_shuffle_params, + shuffle_workspace); + input_ptr += 2 * stride_height * params.input_row_size; + output_ptr += 2 * params.output_row_size; + } + + // Handle one row at a time. + for (; out_y < end_y; out_y++) { + conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data, + output_ptr, params, one_row_shuffle_params, + shuffle_workspace); + input_ptr += stride_height * params.input_row_size; + output_ptr += params.output_row_size; + } + } +} +#endif // __aarch64__ + +#endif + +#undef STR +#undef STR_UNEXPANDED + +} // namespace depthwise_conv +} // namespace optimized_ops +} // namespace tflite + +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_3X3_FILTER_H_ diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h index 32b1b67fac0..7fb2d8896b5 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h @@ -15,7 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_FULLY_CONNECTED_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_FULLY_CONNECTED_H_ -#include "public/gemmlowp.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_gemm.h" #include "tensorflow/lite/kernels/cpu_backend_threadpool.h" @@ -25,402 +25,6 @@ limitations under the License. 
namespace tflite { namespace optimized_integer_ops { -inline void optimized_ops_preload_l1_stream(const int8_t* ptr) { -#ifdef GEMMLOWP_ARM_64 - asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :); -#else - gemmlowp::Prefetch(ptr); -#endif -} - -inline void optimized_ops_preload_l1_keep(const int8_t* ptr) { -#ifdef GEMMLOWP_ARM_64 - asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :); -#else - gemmlowp::Prefetch(ptr); -#endif -} - -#ifdef USE_NEON -inline void FullyConnectedAsGEMVWorkerImpl( - const RuntimeShape& input_shape, const int8_t* input_data, - int32 input_offset, const RuntimeShape& filter_shape, - const int8_t* filter_data, int32 filter_offset, - const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset, - int32 output_multiplier, int output_shift, int32 output_activation_min, - int32 output_activation_max, const RuntimeShape& output_shape, - int8_t* output_data, int row_start, int row_end) { - gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMVInt8/8bit"); - TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); - TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); - TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); - const int output_dim_count = output_shape.DimensionsCount(); - TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1); - const int input_size = FlatSizeSkipDim(input_shape, 0); - static constexpr int kPeel = 4; - const bool shift_left = (output_shift > 0); - for (int k = 0; k < input_size; k += 64) { - optimized_ops_preload_l1_stream(input_data + k); - } - for (int k = 0; k < kPeel * input_size; k += 64) { - optimized_ops_preload_l1_stream(filter_data + k); - } - - TFLITE_DCHECK_GE(row_end - row_start, kPeel); - - for (int out = row_start; out < row_end; out += kPeel) { - out = std::min(out, row_end - kPeel); - int32x4_t acc0 = vdupq_n_s32(0); - int32x4_t acc1 = acc0; - int32x4_t acc2 = acc0; - int32x4_t acc3 = acc0; - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset); - int in = 0; - for (; in <= input_size - 16; in += 16) { - const int8x16_t input_val_s8 = vld1q_s8(input_data + in); - const int8_t* filter_ptr = filter_data + in + out * input_size; - int8x16_t filter_val_s8_0 = vld1q_s8(filter_ptr); - optimized_ops_preload_l1_stream(filter_ptr + 64); - filter_ptr += input_size; - int8x16_t filter_val_s8_1 = vld1q_s8(filter_ptr); - optimized_ops_preload_l1_stream(filter_ptr + 64); - filter_ptr += input_size; - int8x16_t filter_val_s8_2 = vld1q_s8(filter_ptr); - optimized_ops_preload_l1_stream(filter_ptr + 64); - filter_ptr += input_size; - int8x16_t filter_val_s8_3 = vld1q_s8(filter_ptr); - optimized_ops_preload_l1_stream(filter_ptr + 64); - int16x8_t input_val_0, input_val_1; - int8x8_t low = vget_low_s8(input_val_s8); - int8x8_t high = vget_high_s8(input_val_s8); - input_val_0 = vmovl_s8(low); - input_val_1 = vmovl_s8(high); - input_val_0 = vaddq_s16(input_val_0, input_offset_vec); - input_val_1 = vaddq_s16(input_val_1, input_offset_vec); - low = vget_low_s8(filter_val_s8_0); - high = vget_high_s8(filter_val_s8_0); - int16x8_t filter_val_0_0 = vmovl_s8(low); - int16x8_t filter_val_0_1 = vmovl_s8(high); - filter_val_0_0 = vaddq_s16(filter_val_0_0, filter_offset_vec); - filter_val_0_1 = vaddq_s16(filter_val_0_1, filter_offset_vec); - low = vget_low_s8(filter_val_s8_1); - high = vget_high_s8(filter_val_s8_1); - int16x8_t filter_val_1_0 = vmovl_s8(low); - int16x8_t filter_val_1_1 = vmovl_s8(high); - filter_val_1_0 = 
vaddq_s16(filter_val_1_0, filter_offset_vec); - filter_val_1_1 = vaddq_s16(filter_val_1_1, filter_offset_vec); - low = vget_low_s8(filter_val_s8_2); - high = vget_high_s8(filter_val_s8_2); - int16x8_t filter_val_2_0 = vmovl_s8(low); - int16x8_t filter_val_2_1 = vmovl_s8(high); - filter_val_2_0 = vaddq_s16(filter_val_2_0, filter_offset_vec); - filter_val_2_1 = vaddq_s16(filter_val_2_1, filter_offset_vec); - low = vget_low_s8(filter_val_s8_3); - high = vget_high_s8(filter_val_s8_3); - int16x8_t filter_val_3_0 = vmovl_s8(low); - int16x8_t filter_val_3_1 = vmovl_s8(high); - filter_val_3_0 = vaddq_s16(filter_val_3_0, filter_offset_vec); - filter_val_3_1 = vaddq_s16(filter_val_3_1, filter_offset_vec); - acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_0), - vget_low_s16(input_val_0)); - acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_0), - vget_low_s16(input_val_0)); - acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_0), - vget_low_s16(input_val_0)); - acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_0), - vget_low_s16(input_val_0)); - acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_1), - vget_low_s16(input_val_1)); - acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_1), - vget_low_s16(input_val_1)); - acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_1), - vget_low_s16(input_val_1)); - acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_1), - vget_low_s16(input_val_1)); - acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_0), - vget_high_s16(input_val_0)); - acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_0), - vget_high_s16(input_val_0)); - acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_0), - vget_high_s16(input_val_0)); - acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_0), - vget_high_s16(input_val_0)); - acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_1), - vget_high_s16(input_val_1)); - acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_1), - vget_high_s16(input_val_1)); - acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_1), - vget_high_s16(input_val_1)); - acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_1), - vget_high_s16(input_val_1)); - } - for (; in <= input_size - 8; in += 8) { - const int8x8_t input_val_s8 = vld1_s8(input_data + in); - const int8_t* filter_ptr = filter_data + in + out * input_size; - int8x8_t filter_val_s8_0 = vld1_s8(filter_ptr); - filter_ptr += input_size; - int8x8_t filter_val_s8_1 = vld1_s8(filter_ptr); - filter_ptr += input_size; - int8x8_t filter_val_s8_2 = vld1_s8(filter_ptr); - filter_ptr += input_size; - int8x8_t filter_val_s8_3 = vld1_s8(filter_ptr); - int16x8_t input_val = vmovl_s8(input_val_s8); - input_val = vaddq_s16(input_val, input_offset_vec); - int16x8_t filter_val_0 = vmovl_s8(filter_val_s8_0); - filter_val_0 = vaddq_s16(filter_val_0, filter_offset_vec); - int16x8_t filter_val_1 = vmovl_s8(filter_val_s8_1); - filter_val_1 = vaddq_s16(filter_val_1, filter_offset_vec); - int16x8_t filter_val_2 = vmovl_s8(filter_val_s8_2); - filter_val_2 = vaddq_s16(filter_val_2, filter_offset_vec); - int16x8_t filter_val_3 = vmovl_s8(filter_val_s8_3); - filter_val_3 = vaddq_s16(filter_val_3, filter_offset_vec); - acc0 = - vmlal_s16(acc0, vget_low_s16(filter_val_0), vget_low_s16(input_val)); - acc1 = - vmlal_s16(acc1, vget_low_s16(filter_val_1), vget_low_s16(input_val)); - acc2 = - vmlal_s16(acc2, vget_low_s16(filter_val_2), vget_low_s16(input_val)); - acc3 = - vmlal_s16(acc3, vget_low_s16(filter_val_3), vget_low_s16(input_val)); - acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0), - vget_high_s16(input_val)); - acc1 = vmlal_s16(acc1, 
vget_high_s16(filter_val_1), - vget_high_s16(input_val)); - acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2), - vget_high_s16(input_val)); - acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3), - vget_high_s16(input_val)); - } - if (in < input_size) { - int32 buf[16]; - vst1q_s32(buf + 0, acc0); - vst1q_s32(buf + 4, acc1); - vst1q_s32(buf + 8, acc2); - vst1q_s32(buf + 12, acc3); - for (; in < input_size; in++) { - int lane = (in + 8 - input_size) % 4; - const int32 input_val = input_data[in] + input_offset; - for (int k = 0; k < kPeel; k++) { - int32 filter_val = - filter_data[in + (out + k) * input_size] + filter_offset; - buf[lane + 4 * k] += filter_val * input_val; - } - } - acc0 = vld1q_s32(buf + 0); - acc1 = vld1q_s32(buf + 4); - acc2 = vld1q_s32(buf + 8); - acc3 = vld1q_s32(buf + 12); - } - - // Horizontally reduce accumulators - int32x2_t pairwise_reduced_acc_0 = - vpadd_s32(vget_low_s32(acc0), vget_high_s32(acc0)); - int32x2_t pairwise_reduced_acc_1 = - vpadd_s32(vget_low_s32(acc1), vget_high_s32(acc1)); - int32x2_t pairwise_reduced_acc_2 = - vpadd_s32(vget_low_s32(acc2), vget_high_s32(acc2)); - int32x2_t pairwise_reduced_acc_3 = - vpadd_s32(vget_low_s32(acc3), vget_high_s32(acc3)); - const int32x2_t reduced_lo = - vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); - const int32x2_t reduced_hi = - vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); - int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); - // Add bias values. - int32x4_t bias_vec = vld1q_s32(bias_data + out); - reduced = vaddq_s32(reduced, bias_vec); - if (shift_left) { - const int32 multiplier_power_of_two = 1 << output_shift; - reduced = vmulq_n_s32(reduced, multiplier_power_of_two); - reduced = vqrdmulhq_n_s32(reduced, output_multiplier); - } else { - // Multiply by the fixed-point multiplier. - reduced = vqrdmulhq_n_s32(reduced, output_multiplier); - // Rounding-shift-right. - using gemmlowp::RoundingDivideByPOT; - reduced = RoundingDivideByPOT(reduced, -output_shift); - } - // Add the output offset. - const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); - reduced = vaddq_s32(reduced, output_offset_vec); - // Narrow values down to 16 bit signed. - const int16x4_t res16 = vqmovn_s32(reduced); - // Narrow values down to 8 bit signed, saturating. - int8x8_t res8 = vqmovn_s16(vcombine_s16(res16, res16)); - // Apply the clamping from the activation function - res8 = vmax_s8(res8, vdup_n_s8(output_activation_min)); - res8 = vmin_s8(res8, vdup_n_s8(output_activation_max)); - // Store results to destination. 
- vst1_lane_s8(output_data + out + 0, res8, 0); - vst1_lane_s8(output_data + out + 1, res8, 1); - vst1_lane_s8(output_data + out + 2, res8, 2); - vst1_lane_s8(output_data + out + 3, res8, 3); - } -} - -struct FullyConnectedAsGEMVWorkerTask : public cpu_backend_threadpool::Task { - FullyConnectedAsGEMVWorkerTask( - const RuntimeShape& input_shape, const int8_t* input_data, - int32 input_offset, const RuntimeShape& filter_shape, - const int8_t* filter_data, int32 filter_offset, - const RuntimeShape& bias_shape, const int32* bias_data, - int32 output_offset, int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - const RuntimeShape& output_shape, int8_t* output_data, int row_start, - int row_end) - : input_shape_(input_shape), - input_data_(input_data), - input_offset_(input_offset), - filter_shape_(filter_shape), - filter_data_(filter_data), - filter_offset_(filter_offset), - bias_shape_(bias_shape), - bias_data_(bias_data), - output_offset_(output_offset), - output_multiplier_(output_multiplier), - output_shift_(output_shift), - output_activation_min_(output_activation_min), - output_activation_max_(output_activation_max), - output_shape_(output_shape), - output_data_(output_data), - row_start_(row_start), - row_end_(row_end) {} - - void Run() override { - FullyConnectedAsGEMVWorkerImpl( - input_shape_, input_data_, input_offset_, filter_shape_, filter_data_, - filter_offset_, bias_shape_, bias_data_, output_offset_, - output_multiplier_, output_shift_, output_activation_min_, - output_activation_max_, output_shape_, output_data_, row_start_, - row_end_); - } - - const RuntimeShape& input_shape_; - const int8_t* input_data_; - int32 input_offset_; - const RuntimeShape& filter_shape_; - const int8_t* filter_data_; - int32 filter_offset_; - const RuntimeShape& bias_shape_; - const int32* bias_data_; - int32 output_offset_; - int32 output_multiplier_; - int output_shift_; - int32 output_activation_min_; - int32 output_activation_max_; - const RuntimeShape& output_shape_; - int8_t* output_data_; - int row_start_; - int row_end_; -}; - -inline void FullyConnectedAsGEMV( - const RuntimeShape& input_shape, const int8_t* input_data, - int32 input_offset, const RuntimeShape& filter_shape, - const int8_t* filter_data, int32 filter_offset, - const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset, - int32 output_multiplier, int output_shift, int32 output_activation_min, - int32 output_activation_max, const RuntimeShape& output_shape, - int8_t* output_data, CpuBackendContext* cpu_backend_context) { - const int output_dim_count = output_shape.DimensionsCount(); - const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); - const int output_rows = output_shape.Dims(output_dim_count - 1); - const int input_size = FlatSizeSkipDim(input_shape, 0); - static constexpr int kKernelRows = 4; - const int thread_count = gemmlowp::HowManyThreads<kKernelRows>( - cpu_backend_context->max_num_threads(), output_rows, batches, input_size); - if (thread_count == 1) { - // Single-thread case: do the computation on the current thread, don't - // use a threadpool - FullyConnectedAsGEMVWorkerImpl( - input_shape, input_data, input_offset, filter_shape, filter_data, - filter_offset, bias_shape, bias_data, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_shape, output_data, 0, output_rows); - return; - } - - // Multi-threaded case: use the gemmlowp context's threadpool. 
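// Editorial note, not part of the patch: assuming gemmlowp::CeilQuotient and
// gemmlowp::RoundUp<N> have their usual meanings (ceiling division and
// rounding up to a multiple of N), the partition below gives every worker a
// row count that is a multiple of kKernelRows = 4. For example, with
// output_rows = 100 and thread_count = 3, CeilQuotient(100, 3) = 34,
// RoundUp<4>(34) = 36, and the tasks cover [0, 36), [36, 72), [72, 100),
// with the last worker taking the shorter tail.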
- TFLITE_DCHECK_GT(thread_count, 1); - std::vector<FullyConnectedAsGEMVWorkerTask> tasks; - // TODO(b/131746020) don't create new heap allocations every time. - // At least we make it a single heap allocation by using reserve(). - tasks.reserve(thread_count); - const int kRowsPerWorker = gemmlowp::RoundUp<kKernelRows>( - gemmlowp::CeilQuotient(output_rows, thread_count)); - int row_start = 0; - for (int i = 0; i < thread_count; ++i) { - int row_end = std::min(output_rows, row_start + kRowsPerWorker); - tasks.emplace_back(input_shape, input_data, input_offset, filter_shape, - filter_data, filter_offset, bias_shape, bias_data, - output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, - output_shape, output_data, row_start, row_end); - row_start = row_end; - } - TFLITE_DCHECK_EQ(row_start, output_rows); - cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), - cpu_backend_context); -} -#endif // USE_NEON - -struct GemmlowpOutputPipeline { - typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col> - ColVectorMap; - typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>, - gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, - gemmlowp::OutputStageClamp, - gemmlowp::OutputStageSaturatingCastToInt8> - Pipeline; - static Pipeline MakeExp(const int32* bias_data, int output_rows, - int32 output_offset, int32 output_multiplier, - int output_left_shift, int32 output_activation_min, - int32 output_activation_max) { - ColVectorMap bias_vector(bias_data, output_rows); - gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage; - bias_addition_stage.bias_vector = bias_vector; - gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage; - quantize_down_stage.result_offset_after_shift = output_offset; - quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; - quantize_down_stage.result_exponent = output_left_shift; - gemmlowp::OutputStageClamp clamp_stage; - clamp_stage.min = output_activation_min; - clamp_stage.max = output_activation_max; - gemmlowp::OutputStageSaturatingCastToInt8 saturating_cast_stage; - return std::make_tuple(bias_addition_stage, quantize_down_stage, - clamp_stage, saturating_cast_stage); - } -}; - -struct GemmlowpOutputPipelineInt8 { - typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col> - ColVectorMap; - typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>, - gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, - gemmlowp::OutputStageClamp, - gemmlowp::OutputStageSaturatingCastToInt8> - Pipeline; - static Pipeline MakeExp(const int32* bias_data, int output_rows, - int32 output_offset, int32 output_multiplier, - int output_left_shift, int32 output_activation_min, - int32 output_activation_max) { - ColVectorMap bias_vector(bias_data, output_rows); - gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage; - bias_addition_stage.bias_vector = bias_vector; - gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage; - quantize_down_stage.result_offset_after_shift = output_offset; - quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; - quantize_down_stage.result_exponent = output_left_shift; - gemmlowp::OutputStageClamp clamp_stage; - clamp_stage.min = output_activation_min; - clamp_stage.max = output_activation_max; - gemmlowp::OutputStageSaturatingCastToInt8 saturating_cast_stage; - return std::make_tuple(bias_addition_stage, quantize_down_stage, - clamp_stage, saturating_cast_stage); - } -}; - 
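For readers who do not want to decode the gemmlowp output stages above (or the equivalent NEON tail in the removed GEMV kernel), the following scalar sketch shows the per-output requantization they perform: add the bias, scale the int32 accumulator by a fixed-point multiplier and a power-of-two exponent, add the output offset, clamp to the activation range, and cast to int8. It is an editorial sketch, not part of the patch; the helper names are illustrative and mirror gemmlowp's SaturatingRoundingDoublingHighMul and RoundingDivideByPOT while glossing over some saturation corner cases.

#include <algorithm>
#include <cstdint>
#include <limits>

// Rounded (a * b) / 2^31, saturated for the single overflowing input pair.
inline std::int32_t SaturatingRoundingDoublingHighMulSketch(std::int32_t a,
                                                            std::int32_t b) {
  const bool overflow =
      a == b && a == std::numeric_limits<std::int32_t>::min();
  const std::int64_t ab = static_cast<std::int64_t>(a) * b;
  const std::int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  const std::int32_t result =
      static_cast<std::int32_t>((ab + nudge) / (1ll << 31));
  return overflow ? std::numeric_limits<std::int32_t>::max() : result;
}

// Rounding right shift by `exponent` (rounds half away from zero).
inline std::int32_t RoundingDivideByPOTSketch(std::int32_t x, int exponent) {
  const std::int32_t mask = (1ll << exponent) - 1;
  const std::int32_t remainder = x & mask;
  const std::int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

inline std::int8_t RequantizeSketch(std::int32_t acc, std::int32_t bias,
                                    std::int32_t multiplier, int exponent,
                                    std::int32_t output_offset,
                                    std::int32_t qmin, std::int32_t qmax) {
  acc += bias;                                // OutputStageBiasAddition
  if (exponent > 0) acc *= (1 << exponent);   // left-shift part of the exponent
  acc = SaturatingRoundingDoublingHighMulSketch(acc, multiplier);
  if (exponent < 0) acc = RoundingDivideByPOTSketch(acc, -exponent);
  acc += output_offset;                       // result_offset_after_shift
  acc = std::min(std::max(acc, qmin), qmax);  // OutputStageClamp
  return static_cast<std::int8_t>(acc);       // saturating cast (already clamped)
}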
inline void FullyConnected( const FullyConnectedParams& params, const RuntimeShape& input_shape, const int8* input_data, const RuntimeShape& filter_shape, @@ -446,22 +50,6 @@ inline void FullyConnected( const int output_dim_count = output_shape.DimensionsCount(); const int filter_dim_count = filter_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); - -#ifdef USE_NEON - if (batches == 1) { - const int output_size = MatchingDim(filter_shape, filter_dim_count - 2, - output_shape, output_dim_count - 1); - if (output_size >= 4) { - return FullyConnectedAsGEMV( - input_shape, input_data, input_offset, filter_shape, filter_data, - filter_offset, bias_shape, bias_data, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_shape, output_data, - cpu_backend_context); - } - } -#endif // USE_NEON - const int filter_rows = filter_shape.Dims(filter_dim_count - 2); const int filter_cols = filter_shape.Dims(filter_dim_count - 1); TFLITE_DCHECK_EQ(filter_shape.FlatSize(), filter_rows * filter_cols); diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h index a16d39be553..ff261a873e6 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h @@ -15,7 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MUL_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MUL_H_ -#include "public/gemmlowp.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/pooling.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/pooling.h index 5037328b92a..beabd616c1b 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/pooling.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/pooling.h @@ -18,6 +18,7 @@ limitations under the License. #include <assert.h> #include <stdint.h> #include <sys/types.h> + #include <algorithm> #include <cmath> #include <cstdint> @@ -27,7 +28,7 @@ limitations under the License. #include <type_traits> #include "fixedpoint/fixedpoint.h" -#include "public/gemmlowp.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/softmax.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/softmax.h index 6e6cf2ef511..dc29fcb15a6 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/softmax.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/softmax.h @@ -15,7 +15,8 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_SOFTMAX_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_SOFTMAX_H_ -#include "public/gemmlowp.h" +#include "fixedpoint/fixedpoint.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/internal/common.h" namespace tflite { diff --git a/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h index 6ae6bb3b764..ef2e995b714 100644 --- a/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h @@ -18,6 +18,7 @@ limitations under the License. #include <stdint.h> #include <sys/types.h> +#include "public/gemmlowp.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h" #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h" @@ -526,6 +527,18 @@ void AddBiasAndEvalActivationFunction(const float* bias_data, output_activation_max); } +template <typename Lhs, typename Rhs, typename Result> +void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs, + Eigen::MatrixBase<Result>* result) { + if (rhs.cols() == 1) { + gemmlowp::ScopedProfilingLabel label("GEMV"); + result->col(0).noalias() = lhs * rhs.col(0); + } else { + gemmlowp::ScopedProfilingLabel label("GEMM"); + result->noalias() = lhs * rhs; + } +} + inline void FullyConnected( const FullyConnectedParams& params, const RuntimeShape& input_shape, const float* input_data, const RuntimeShape& weights_shape, @@ -598,7 +611,280 @@ void FullyConnected(const float* input_data, const Dims<4>& input_dims, output_data, output_dims); } +struct GemmlowpOutputPipeline { + typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col> + ColVectorMap; + typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>, + gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, + gemmlowp::OutputStageClamp, + gemmlowp::OutputStageSaturatingCastToUint8> + Pipeline; + static Pipeline MakeExp(const int32* bias_data, int output_rows, + int32 output_offset, int32 output_multiplier, + int output_left_shift, int32 output_activation_min, + int32 output_activation_max) { + ColVectorMap bias_vector(bias_data, output_rows); + gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage; + bias_addition_stage.bias_vector = bias_vector; + gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage; + quantize_down_stage.result_offset_after_shift = output_offset; + quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; + quantize_down_stage.result_exponent = output_left_shift; + gemmlowp::OutputStageClamp clamp_stage; + clamp_stage.min = output_activation_min; + clamp_stage.max = output_activation_max; + gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; + return std::make_tuple(bias_addition_stage, quantize_down_stage, + clamp_stage, saturating_cast_stage); + } +}; + +struct GemmlowpOutputPipelineInt8 { + typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col> + ColVectorMap; + typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>, + gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, + gemmlowp::OutputStageClamp, + gemmlowp::OutputStageSaturatingCastToInt8> + Pipeline; + static Pipeline MakeExp(const int32* bias_data, int output_rows, + int32 output_offset, int32 output_multiplier, + int output_left_shift, int32 
output_activation_min, + int32 output_activation_max) { + ColVectorMap bias_vector(bias_data, output_rows); + gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage; + bias_addition_stage.bias_vector = bias_vector; + gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage; + quantize_down_stage.result_offset_after_shift = output_offset; + quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; + quantize_down_stage.result_exponent = output_left_shift; + gemmlowp::OutputStageClamp clamp_stage; + clamp_stage.min = output_activation_min; + clamp_stage.max = output_activation_max; + gemmlowp::OutputStageSaturatingCastToInt8 saturating_cast_stage; + return std::make_tuple(bias_addition_stage, quantize_down_stage, + clamp_stage, saturating_cast_stage); + } +}; + #ifdef USE_NEON +inline void LegacyFullyConnectedAsGEMVWorkerImpl( + const RuntimeShape& input_shape, const uint8* input_data, + int32 input_offset, const RuntimeShape& filter_shape, + const uint8* filter_data, int32 filter_offset, + const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset, + int32 output_multiplier, int output_shift, int32 output_activation_min, + int32 output_activation_max, const RuntimeShape& output_shape, + uint8* output_data, int row_start, int row_end) { + gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit"); + TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); + TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); + TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); + const int output_dim_count = output_shape.DimensionsCount(); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1); + const int input_size = FlatSizeSkipDim(input_shape, 0); + static constexpr int kPeel = 4; + const bool shift_left = (output_shift > 0); + for (int k = 0; k < input_size; k += 64) { + optimized_ops_preload_l1_stream(input_data + k); + } + for (int k = 0; k < kPeel * input_size; k += 64) { + optimized_ops_preload_l1_stream(filter_data + k); + } + + TFLITE_DCHECK_GE(row_end - row_start, kPeel); + + for (int out = row_start; out < row_end; out += kPeel) { + out = std::min(out, row_end - kPeel); + int32x4_t acc0 = vdupq_n_s32(0); + int32x4_t acc1 = acc0; + int32x4_t acc2 = acc0; + int32x4_t acc3 = acc0; + const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); + const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset); + int in = 0; + for (; in <= input_size - 16; in += 16) { + const uint8x16_t input_val_u8 = vld1q_u8(input_data + in); + const uint8* filter_ptr = filter_data + in + out * input_size; + uint8x16_t filter_val_u8_0 = vld1q_u8(filter_ptr); + optimized_ops_preload_l1_stream(filter_ptr + 64); + filter_ptr += input_size; + uint8x16_t filter_val_u8_1 = vld1q_u8(filter_ptr); + optimized_ops_preload_l1_stream(filter_ptr + 64); + filter_ptr += input_size; + uint8x16_t filter_val_u8_2 = vld1q_u8(filter_ptr); + optimized_ops_preload_l1_stream(filter_ptr + 64); + filter_ptr += input_size; + uint8x16_t filter_val_u8_3 = vld1q_u8(filter_ptr); + optimized_ops_preload_l1_stream(filter_ptr + 64); + int16x8_t input_val_0, input_val_1; + uint8x8_t low = vget_low_u8(input_val_u8); + uint8x8_t high = vget_high_u8(input_val_u8); + input_val_0 = vreinterpretq_s16_u16(vmovl_u8(low)); + input_val_1 = vreinterpretq_s16_u16(vmovl_u8(high)); + input_val_0 = vaddq_s16(input_val_0, input_offset_vec); + input_val_1 = vaddq_s16(input_val_1, input_offset_vec); + low = vget_low_u8(filter_val_u8_0); + high = vget_high_u8(filter_val_u8_0); + 
int16x8_t filter_val_0_0 = vreinterpretq_s16_u16(vmovl_u8(low)); + int16x8_t filter_val_0_1 = vreinterpretq_s16_u16(vmovl_u8(high)); + filter_val_0_0 = vaddq_s16(filter_val_0_0, filter_offset_vec); + filter_val_0_1 = vaddq_s16(filter_val_0_1, filter_offset_vec); + low = vget_low_u8(filter_val_u8_1); + high = vget_high_u8(filter_val_u8_1); + int16x8_t filter_val_1_0 = vreinterpretq_s16_u16(vmovl_u8(low)); + int16x8_t filter_val_1_1 = vreinterpretq_s16_u16(vmovl_u8(high)); + filter_val_1_0 = vaddq_s16(filter_val_1_0, filter_offset_vec); + filter_val_1_1 = vaddq_s16(filter_val_1_1, filter_offset_vec); + low = vget_low_u8(filter_val_u8_2); + high = vget_high_u8(filter_val_u8_2); + int16x8_t filter_val_2_0 = vreinterpretq_s16_u16(vmovl_u8(low)); + int16x8_t filter_val_2_1 = vreinterpretq_s16_u16(vmovl_u8(high)); + filter_val_2_0 = vaddq_s16(filter_val_2_0, filter_offset_vec); + filter_val_2_1 = vaddq_s16(filter_val_2_1, filter_offset_vec); + low = vget_low_u8(filter_val_u8_3); + high = vget_high_u8(filter_val_u8_3); + int16x8_t filter_val_3_0 = vreinterpretq_s16_u16(vmovl_u8(low)); + int16x8_t filter_val_3_1 = vreinterpretq_s16_u16(vmovl_u8(high)); + filter_val_3_0 = vaddq_s16(filter_val_3_0, filter_offset_vec); + filter_val_3_1 = vaddq_s16(filter_val_3_1, filter_offset_vec); + acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_0), + vget_low_s16(input_val_0)); + acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_0), + vget_low_s16(input_val_0)); + acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_0), + vget_low_s16(input_val_0)); + acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_0), + vget_low_s16(input_val_0)); + acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_1), + vget_low_s16(input_val_1)); + acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_1), + vget_low_s16(input_val_1)); + acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_1), + vget_low_s16(input_val_1)); + acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_1), + vget_low_s16(input_val_1)); + acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_0), + vget_high_s16(input_val_0)); + acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_0), + vget_high_s16(input_val_0)); + acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_0), + vget_high_s16(input_val_0)); + acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_0), + vget_high_s16(input_val_0)); + acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_1), + vget_high_s16(input_val_1)); + acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_1), + vget_high_s16(input_val_1)); + acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_1), + vget_high_s16(input_val_1)); + acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_1), + vget_high_s16(input_val_1)); + } + for (; in <= input_size - 8; in += 8) { + const uint8x8_t input_val_u8 = vld1_u8(input_data + in); + const uint8* filter_ptr = filter_data + in + out * input_size; + uint8x8_t filter_val_u8_0 = vld1_u8(filter_ptr); + filter_ptr += input_size; + uint8x8_t filter_val_u8_1 = vld1_u8(filter_ptr); + filter_ptr += input_size; + uint8x8_t filter_val_u8_2 = vld1_u8(filter_ptr); + filter_ptr += input_size; + uint8x8_t filter_val_u8_3 = vld1_u8(filter_ptr); + int16x8_t input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8)); + input_val = vaddq_s16(input_val, input_offset_vec); + int16x8_t filter_val_0 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_0)); + filter_val_0 = vaddq_s16(filter_val_0, filter_offset_vec); + int16x8_t filter_val_1 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_1)); + filter_val_1 = vaddq_s16(filter_val_1, filter_offset_vec); + int16x8_t filter_val_2 = 
vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_2)); + filter_val_2 = vaddq_s16(filter_val_2, filter_offset_vec); + int16x8_t filter_val_3 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_3)); + filter_val_3 = vaddq_s16(filter_val_3, filter_offset_vec); + acc0 = + vmlal_s16(acc0, vget_low_s16(filter_val_0), vget_low_s16(input_val)); + acc1 = + vmlal_s16(acc1, vget_low_s16(filter_val_1), vget_low_s16(input_val)); + acc2 = + vmlal_s16(acc2, vget_low_s16(filter_val_2), vget_low_s16(input_val)); + acc3 = + vmlal_s16(acc3, vget_low_s16(filter_val_3), vget_low_s16(input_val)); + acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0), + vget_high_s16(input_val)); + acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1), + vget_high_s16(input_val)); + acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2), + vget_high_s16(input_val)); + acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3), + vget_high_s16(input_val)); + } + if (in < input_size) { + int32 buf[16]; + vst1q_s32(buf + 0, acc0); + vst1q_s32(buf + 4, acc1); + vst1q_s32(buf + 8, acc2); + vst1q_s32(buf + 12, acc3); + for (; in < input_size; in++) { + int lane = (in + 8 - input_size) % 4; + const int32 input_val = input_data[in] + input_offset; + for (int k = 0; k < kPeel; k++) { + int32 filter_val = + filter_data[in + (out + k) * input_size] + filter_offset; + buf[lane + 4 * k] += filter_val * input_val; + } + } + acc0 = vld1q_s32(buf + 0); + acc1 = vld1q_s32(buf + 4); + acc2 = vld1q_s32(buf + 8); + acc3 = vld1q_s32(buf + 12); + } + + // Horizontally reduce accumulators + int32x2_t pairwise_reduced_acc_0 = + vpadd_s32(vget_low_s32(acc0), vget_high_s32(acc0)); + int32x2_t pairwise_reduced_acc_1 = + vpadd_s32(vget_low_s32(acc1), vget_high_s32(acc1)); + int32x2_t pairwise_reduced_acc_2 = + vpadd_s32(vget_low_s32(acc2), vget_high_s32(acc2)); + int32x2_t pairwise_reduced_acc_3 = + vpadd_s32(vget_low_s32(acc3), vget_high_s32(acc3)); + const int32x2_t reduced_lo = + vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); + const int32x2_t reduced_hi = + vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); + int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); + // Add bias values. + int32x4_t bias_vec = vld1q_s32(bias_data + out); + reduced = vaddq_s32(reduced, bias_vec); + if (shift_left) { + const int32 multiplier_power_of_two = 1 << output_shift; + reduced = vmulq_n_s32(reduced, multiplier_power_of_two); + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); + } else { + // Multiply by the fixed-point multiplier. + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); + // Rounding-shift-right. + using gemmlowp::RoundingDivideByPOT; + reduced = RoundingDivideByPOT(reduced, -output_shift); + } + // Add the output offset. + const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); + reduced = vaddq_s32(reduced, output_offset_vec); + // Narrow values down to 16 bit signed. + const int16x4_t res16 = vqmovn_s32(reduced); + // Narrow values down to 8 bit unsigned, saturating. + uint8x8_t res8 = vqmovun_s16(vcombine_s16(res16, res16)); + // Apply the clamping from the activation function + res8 = vmax_u8(res8, vdup_n_u8(output_activation_min)); + res8 = vmin_u8(res8, vdup_n_u8(output_activation_max)); + // Store results to destination. 
+ vst1_lane_u8(output_data + out + 0, res8, 0); + vst1_lane_u8(output_data + out + 1, res8, 1); + vst1_lane_u8(output_data + out + 2, res8, 2); + vst1_lane_u8(output_data + out + 3, res8, 3); + } +} + struct LegacyFullyConnectedAsGEMVWorkerTask : public gemmlowp::Task { LegacyFullyConnectedAsGEMVWorkerTask( const RuntimeShape& input_shape, const uint8* input_data, @@ -628,7 +914,7 @@ struct LegacyFullyConnectedAsGEMVWorkerTask : public gemmlowp::Task { row_end_(row_end) {} void Run() override { - FullyConnectedAsGEMVWorkerImpl( + LegacyFullyConnectedAsGEMVWorkerImpl( input_shape_, input_data_, input_offset_, filter_shape_, filter_data_, filter_offset_, bias_shape_, bias_data_, output_offset_, output_multiplier_, output_shift_, output_activation_min_, @@ -673,7 +959,7 @@ inline void FullyConnectedAsGEMV( if (thread_count == 1) { // Single-thread case: do the computation on the current thread, don't // use a threadpool - FullyConnectedAsGEMVWorkerImpl( + LegacyFullyConnectedAsGEMVWorkerImpl( input_shape, input_data, input_offset, filter_shape, filter_data, filter_offset, bias_shape, bias_data, output_offset, output_multiplier, output_shift, output_activation_min, output_activation_max, @@ -760,6 +1046,491 @@ inline void FullyConnected( filter_offset, input_offset, output_pipeline); } +#ifdef GEMMLOWP_NEON +// In the common case of batch size 1, a fully-connected node degenerates +// to a matrix*vector product. LSTM cells contain a fully-connected node; +// when quantized, this becomes a special type of GEMV operation where +// the output is 16bit-quantized, thus needs its own special path. +inline void GEMVForLstmCell(const RuntimeShape& input_shape, + const uint8* input_data, + const RuntimeShape& weights_shape, + const uint8* weights_data, uint8 weights_zero_point, + const RuntimeShape& bias_shape, + const int32* bias_data, int32 accum_multiplier, + int accum_shift, const RuntimeShape& output_shape, + int16* output_data) { + gemmlowp::ScopedProfilingLabel label("GEMVForLstmCell"); + TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); + TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); + TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); + const int output_dim_count = output_shape.DimensionsCount(); + const int weights_dim_count = weights_shape.DimensionsCount(); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1); + const int input_size = FlatSizeSkipDim(input_shape, 0); + const int output_size = MatchingDim(weights_shape, weights_dim_count - 2, + output_shape, output_dim_count - 1); + // This special fast path for quantized LSTM cells does not try to support + // odd sizes that we haven't encountered in any LSTM cell, that would + // require special code (that would go untested until any LSTM cell + // exercises it). We just guard our assumptions about size evenness with + // the following assertions. + TFLITE_DCHECK(!(output_size % 4)); + TFLITE_DCHECK(!(input_size % 8)); + const int32* bias_ptr = bias_data; + int16* output_ptr = output_data; + for (int out = 0; out < output_size; out += 4) { + int32x4_t acc_0 = vdupq_n_s32(0); + int32x4_t acc_1 = vdupq_n_s32(0); + int32x4_t acc_2 = vdupq_n_s32(0); + int32x4_t acc_3 = vdupq_n_s32(0); + const int16x8_t input_offset_vec = vdupq_n_s16(-128); + const int16x8_t weights_offset_vec = vdupq_n_s16(-weights_zero_point); + int in = 0; + // Handle 16 levels of depth at a time. 
+ for (; in <= input_size - 16; in += 16) { + const uint8x16_t input_val_u8 = vld1q_u8(input_data + in); + const uint8* weights_ptr = weights_data + in + out * input_size; + uint8x16_t weights_val_u8_0 = vld1q_u8(weights_ptr + 0 * input_size); + uint8x16_t weights_val_u8_1 = vld1q_u8(weights_ptr + 1 * input_size); + uint8x16_t weights_val_u8_2 = vld1q_u8(weights_ptr + 2 * input_size); + uint8x16_t weights_val_u8_3 = vld1q_u8(weights_ptr + 3 * input_size); + int16x8_t input_val_0, input_val_1; + const uint8x8_t low = vget_low_u8(input_val_u8); + const uint8x8_t high = vget_high_u8(input_val_u8); + input_val_0 = vreinterpretq_s16_u16(vmovl_u8(low)); + input_val_1 = vreinterpretq_s16_u16(vmovl_u8(high)); + input_val_0 = vaddq_s16(input_val_0, input_offset_vec); + input_val_1 = vaddq_s16(input_val_1, input_offset_vec); + int16x8_t weights_val_0_0, weights_val_1_0, weights_val_2_0, + weights_val_3_0; + int16x8_t weights_val_0_1, weights_val_1_1, weights_val_2_1, + weights_val_3_1; + weights_val_0_0 = vaddq_s16( + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_0))), + weights_offset_vec); + weights_val_0_1 = vaddq_s16( + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_0))), + weights_offset_vec); + weights_val_1_0 = vaddq_s16( + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_1))), + weights_offset_vec); + weights_val_1_1 = vaddq_s16( + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_1))), + weights_offset_vec); + weights_val_2_0 = vaddq_s16( + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_2))), + weights_offset_vec); + weights_val_2_1 = vaddq_s16( + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_2))), + weights_offset_vec); + weights_val_3_0 = vaddq_s16( + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_3))), + weights_offset_vec); + weights_val_3_1 = vaddq_s16( + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_3))), + weights_offset_vec); + acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_0), + vget_low_s16(input_val_0)); + acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_0), + vget_low_s16(input_val_0)); + acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_0), + vget_low_s16(input_val_0)); + acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_0), + vget_low_s16(input_val_0)); + acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_0), + vget_high_s16(input_val_0)); + acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_0), + vget_high_s16(input_val_0)); + acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_0), + vget_high_s16(input_val_0)); + acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_0), + vget_high_s16(input_val_0)); + acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_1), + vget_low_s16(input_val_1)); + acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_1), + vget_low_s16(input_val_1)); + acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_1), + vget_low_s16(input_val_1)); + acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_1), + vget_low_s16(input_val_1)); + acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_1), + vget_high_s16(input_val_1)); + acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_1), + vget_high_s16(input_val_1)); + acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_1), + vget_high_s16(input_val_1)); + acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_1), + vget_high_s16(input_val_1)); + } + // Handle 8 levels of depth at a time. 
+ for (; in < input_size; in += 8) { + const uint8x8_t input_val_u8 = vld1_u8(input_data + in); + const uint8* weights_ptr = weights_data + in + out * input_size; + uint8x8_t weights_val_u8_0 = vld1_u8(weights_ptr + 0 * input_size); + uint8x8_t weights_val_u8_1 = vld1_u8(weights_ptr + 1 * input_size); + uint8x8_t weights_val_u8_2 = vld1_u8(weights_ptr + 2 * input_size); + uint8x8_t weights_val_u8_3 = vld1_u8(weights_ptr + 3 * input_size); + int16x8_t input_val; + input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8)); + input_val = vaddq_s16(input_val, input_offset_vec); + int16x8_t weights_val_0, weights_val_1, weights_val_2, weights_val_3; + weights_val_0 = + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_0)), + weights_offset_vec); + weights_val_1 = + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_1)), + weights_offset_vec); + weights_val_2 = + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_2)), + weights_offset_vec); + weights_val_3 = + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_3)), + weights_offset_vec); + acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0), + vget_low_s16(input_val)); + acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1), + vget_low_s16(input_val)); + acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2), + vget_low_s16(input_val)); + acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3), + vget_low_s16(input_val)); + acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0), + vget_high_s16(input_val)); + acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1), + vget_high_s16(input_val)); + acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2), + vget_high_s16(input_val)); + acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3), + vget_high_s16(input_val)); + } + // Horizontally reduce accumulators + int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, + pairwise_reduced_acc_2, pairwise_reduced_acc_3; + pairwise_reduced_acc_0 = + vpadd_s32(vget_low_s32(acc_0), vget_high_s32(acc_0)); + pairwise_reduced_acc_1 = + vpadd_s32(vget_low_s32(acc_1), vget_high_s32(acc_1)); + pairwise_reduced_acc_2 = + vpadd_s32(vget_low_s32(acc_2), vget_high_s32(acc_2)); + pairwise_reduced_acc_3 = + vpadd_s32(vget_low_s32(acc_3), vget_high_s32(acc_3)); + const int32x2_t reduced_lo = + vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); + const int32x2_t reduced_hi = + vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); + int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); + // Add bias values. + int32x4_t bias_vec = vld1q_s32(bias_ptr); + bias_ptr += 4; + reduced = vaddq_s32(reduced, bias_vec); + int left_shift = accum_shift > 0 ? accum_shift : 0; + int right_shift = accum_shift > 0 ? 0 : -accum_shift; + reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); + // Multiply by the fixed-point multiplier. + reduced = vqrdmulhq_n_s32(reduced, accum_multiplier); + // Rounding-shift-right. + using gemmlowp::RoundingDivideByPOT; + reduced = RoundingDivideByPOT(reduced, right_shift); + // Narrow values down to 16 bit signed. 
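// Editorial note, not part of the patch: in scalar terms, each of these four
// output lanes has just been computed as
//   acc = sum_k (weights[k] - weights_zero_point) * (input[k] - 128) + bias
//   acc = RoundingDivideByPOT(
//             SaturatingRoundingDoublingHighMul(acc << left_shift,
//                                               accum_multiplier),
//             right_shift)
// i.e. the accumulator is rescaled by roughly
// accum_multiplier * 2^(accum_shift - 31), and the vqmovn_s32 below saturates
// the result into the int16 output range.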
+ const int16x4_t res16 = vqmovn_s32(reduced); + vst1_s16(output_ptr, res16); + output_ptr += 4; + } +} +#endif + +#ifdef GEMMLOWP_NEON +inline void GEMVForLstmCellWithSymmetricRange( + const RuntimeShape& input_shape, const uint8* input_data, + const RuntimeShape& weights_shape, const uint8* weights_data, + const RuntimeShape& bias_shape, const int32* bias_data, + int32 accum_multiplier, int accum_shift, const RuntimeShape& output_shape, + int16* output_data) { + gemmlowp::ScopedProfilingLabel label("GEMVForLstmCellWithSymmetricRange"); + TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); + TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); + TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); + const int output_dim_count = output_shape.DimensionsCount(); + const int weights_dim_count = weights_shape.DimensionsCount(); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1); + const int input_size = FlatSizeSkipDim(input_shape, 0); + const int output_size = MatchingDim(weights_shape, weights_dim_count - 2, + output_shape, output_dim_count - 1); + // This special fast path for quantized LSTM cells does not try to support + // odd sizes that we haven't encountered in any LSTM cell, that would + // require special code (that would go untested until any LSTM cell + // exercises it). We just guard our assumptions about size evenness with + // the following assertions. + TFLITE_DCHECK(!(output_size % 4)); + TFLITE_DCHECK(!(input_size % 64)); + const int32* bias_ptr = bias_data; + int16* output_ptr = output_data; + const uint8x16_t signbit = vdupq_n_u8(0x80); + for (int in = 0; in < input_size; in += 32) { + optimized_ops_preload_l1_keep(input_data + in); + } + const int left_shift = accum_shift > 0 ? accum_shift : 0; + const int right_shift = accum_shift > 0 ? 0 : -accum_shift; + for (int out = 0; out < output_size; out += 4) { + // Load the bias values + int32x4_t bias_vec = vld1q_s32(bias_ptr); + bias_ptr += 4; + + // Clear accumulators. We use 2 accumulator registers per row, + // for 4 rows. row_accumRN is the N-th accumulator for row R. + int32x4_t row_accum00 = vdupq_n_s32(0); + int32x4_t row_accum01 = vdupq_n_s32(0); + int32x4_t row_accum10 = vdupq_n_s32(0); + int32x4_t row_accum11 = vdupq_n_s32(0); + int32x4_t row_accum20 = vdupq_n_s32(0); + int32x4_t row_accum21 = vdupq_n_s32(0); + int32x4_t row_accum30 = vdupq_n_s32(0); + int32x4_t row_accum31 = vdupq_n_s32(0); + + // kReadAhead parametrizes how far ahead we prefetch weights into L1 cache. + const int kReadAhead = 512; + // Prefetch the first weights values. + for (int k = 0; k < kReadAhead; k += 64) { + optimized_ops_preload_l1_stream(weights_data + (out + 0) * input_size + + k); + optimized_ops_preload_l1_stream(weights_data + (out + 1) * input_size + + k); + optimized_ops_preload_l1_stream(weights_data + (out + 2) * input_size + + k); + optimized_ops_preload_l1_stream(weights_data + (out + 3) * input_size + + k); + } + // Loop along the rows, handling 64 bytes per iteration because that's + // cache line size on most current ARM-architecture CPUs. + for (int in = 0; in < input_size; in += 64) { + // Prefetch some future weights values. 
+ optimized_ops_preload_l1_stream(weights_data + (out + 0) * input_size + + in + kReadAhead); + optimized_ops_preload_l1_stream(weights_data + (out + 1) * input_size + + in + kReadAhead); + optimized_ops_preload_l1_stream(weights_data + (out + 2) * input_size + + in + kReadAhead); + optimized_ops_preload_l1_stream(weights_data + (out + 3) * input_size + + in + kReadAhead); + + // We will use 2 local 16-bit accumulators per row, for 2 rows. + // See below (*) for the rationale of processing only 2 rows at a time. + // local_accumRN is the N-th local accumulator for row R. + int16x8_t local_accum00; + int16x8_t local_accum01; + int16x8_t local_accum10; + int16x8_t local_accum11; + + // Load 64 bytes of input activations values. Convert to signed int8 + // by flipping the sign bit (i.e. subtracting 128, the required + // zero_point value). + int8x16_t input0 = vreinterpretq_s8_u8( + veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 0))); + int8x16_t input1 = vreinterpretq_s8_u8( + veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 1))); + int8x16_t input2 = vreinterpretq_s8_u8( + veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 2))); + int8x16_t input3 = vreinterpretq_s8_u8( + veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 3))); + + // Beginning of the core accumulation. Notice how while we have 4 + // rows to process, this code is taking care of only 2 rows at a time, + // thus being divided into two parts looking similar ("Rows 0 and 1" and + // "Rows 2 and 3"). + // + // (*) The rationale for handling only 2 rows at a time is to avoid + // cache aliasing issues on 4-way set-associative L1-cache CPUs, such + // as Cortex-A53. With sufficiently large, power-of-two matrix dimensions, + // we may find ourselves in a situation where rows alias each other in + // the L1 cache, and moreover may also mutually alias with the input + // activations. If we try to load 4 rows at a time, together with the + // input activations, that may be 5 mutually-aliasing vectors, resulting + // in constant mutual eviction from L1 cache. Handling 2 rows at a time + // here largely mitigates these issues, and seems at least to be very + // effective on Cortex-A53: + // Before After + // big (Cortex-A73) 2.85 ms 2.85 ms + // little (Cortex-A53) 11.0 ms 5.16 ms + + // Rows 0 and 1: + // Load 64 bytes of weights values from each row. Convert to signed int8 + // by flipping the sign bit (i.e. subtracting 128, the required + // zero_point value). 
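The sign-bit flip mentioned in these comments is exact: for any uint8 value u with zero point 128, XOR-ing 0x80 and reinterpreting the byte as int8 gives u - 128. A tiny illustrative check (not code used by this kernel); the weight loads below apply the same conversion.

#include <cstdint>

// Example: 200 ^ 0x80 == 0x48, i.e. +72 == 200 - 128;
//            5 ^ 0x80 == 0x85, which as int8 is -123 == 5 - 128.
inline std::int8_t Uint8ToInt8ViaSignBitFlip(std::uint8_t u) {
  return static_cast<std::int8_t>(u ^ 0x80);
}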
+ int8x16_t weights00 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 0))); + int8x16_t weights01 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 1))); + int8x16_t weights02 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 2))); + int8x16_t weights03 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 3))); + int8x16_t weights10 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 0))); + int8x16_t weights11 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 1))); + int8x16_t weights12 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 2))); + int8x16_t weights13 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 3))); + // Multiply-accumulate into local 16-bit accumulators. + // We can accumulate two products without overflow because weights are + // required to never be -128, so each product is at most 127^2 in absolute + // value. + local_accum00 = vmull_s8(vget_low_s8(weights00), vget_low_s8(input0)); + local_accum01 = vmull_s8(vget_low_s8(weights01), vget_low_s8(input1)); + local_accum10 = vmull_s8(vget_low_s8(weights10), vget_low_s8(input0)); + local_accum11 = vmull_s8(vget_low_s8(weights11), vget_low_s8(input1)); + local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights00), + vget_high_s8(input0)); + local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights01), + vget_high_s8(input1)); + local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights10), + vget_high_s8(input0)); + local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights11), + vget_high_s8(input1)); + // Pairwise add and accumulate into 32-bit accumulators + row_accum00 = vpadalq_s16(row_accum00, local_accum00); + row_accum01 = vpadalq_s16(row_accum01, local_accum01); + row_accum10 = vpadalq_s16(row_accum10, local_accum10); + row_accum11 = vpadalq_s16(row_accum11, local_accum11); + // Multiply-accumulate into local 16-bit accumulators. + // We can accumulate two products without overflow because weights are + // required to never be -128, so each product is at most 127^2 in absolute + // value. + local_accum00 = vmull_s8(vget_low_s8(weights02), vget_low_s8(input2)); + local_accum01 = vmull_s8(vget_low_s8(weights03), vget_low_s8(input3)); + local_accum10 = vmull_s8(vget_low_s8(weights12), vget_low_s8(input2)); + local_accum11 = vmull_s8(vget_low_s8(weights13), vget_low_s8(input3)); + local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights02), + vget_high_s8(input2)); + local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights03), + vget_high_s8(input3)); + local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights12), + vget_high_s8(input2)); + local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights13), + vget_high_s8(input3)); + // Pairwise add and accumulate into 32-bit accumulators + row_accum00 = vpadalq_s16(row_accum00, local_accum00); + row_accum01 = vpadalq_s16(row_accum01, local_accum01); + row_accum10 = vpadalq_s16(row_accum10, local_accum10); + row_accum11 = vpadalq_s16(row_accum11, local_accum11); + + // Rows 2 and 3: + // Load 64 bytes of weights values from each row. Convert to signed int8 + // by flipping the sign bit (i.e. 
subtracting 128, the required + // zero_point value). + weights00 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 0))); + weights01 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 1))); + weights02 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 2))); + weights03 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 3))); + weights10 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 0))); + weights11 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 1))); + weights12 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 2))); + weights13 = vreinterpretq_s8_u8(veorq_u8( + signbit, + vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 3))); + // Multiply-accumulate into local 16-bit accumulators. + // We can accumulate two products without overflow because weights are + // required to never be -128, so each product is at most 127^2 in absolute + // value. + local_accum00 = vmull_s8(vget_low_s8(weights00), vget_low_s8(input0)); + local_accum01 = vmull_s8(vget_low_s8(weights01), vget_low_s8(input1)); + local_accum10 = vmull_s8(vget_low_s8(weights10), vget_low_s8(input0)); + local_accum11 = vmull_s8(vget_low_s8(weights11), vget_low_s8(input1)); + local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights00), + vget_high_s8(input0)); + local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights01), + vget_high_s8(input1)); + local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights10), + vget_high_s8(input0)); + local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights11), + vget_high_s8(input1)); + // Pairwise add and accumulate into 32-bit accumulators + row_accum20 = vpadalq_s16(row_accum20, local_accum00); + row_accum21 = vpadalq_s16(row_accum21, local_accum01); + row_accum30 = vpadalq_s16(row_accum30, local_accum10); + row_accum31 = vpadalq_s16(row_accum31, local_accum11); + // Multiply-accumulate into local 16-bit accumulators. + // We can accumulate two products without overflow because weights are + // required to never be -128, so each product is at most 127^2 in absolute + // value. 
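The overflow bound in that comment can be made concrete: weights are never -128, so |weight| <= 127, while the XOR-converted inputs may reach -128, so a single product is at most 127 * 128 = 16256 in absolute value, and two of them still fit in an int16 lane (32512 <= 32767). A stand-alone check, for illustration only; the vmull_s8/vmlal_s8 pairs below perform exactly those two accumulations per lane.

#include <cstdint>
#include <limits>

static_assert(2 * 127 * 128 <= std::numeric_limits<std::int16_t>::max(),
              "two int8 x int8 products fit in a 16-bit accumulator");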
+ local_accum00 = vmull_s8(vget_low_s8(weights02), vget_low_s8(input2)); + local_accum01 = vmull_s8(vget_low_s8(weights03), vget_low_s8(input3)); + local_accum10 = vmull_s8(vget_low_s8(weights12), vget_low_s8(input2)); + local_accum11 = vmull_s8(vget_low_s8(weights13), vget_low_s8(input3)); + local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights02), + vget_high_s8(input2)); + local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights03), + vget_high_s8(input3)); + local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights12), + vget_high_s8(input2)); + local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights13), + vget_high_s8(input3)); + // Pairwise add and accumulate into 32-bit accumulators + row_accum20 = vpadalq_s16(row_accum20, local_accum00); + row_accum21 = vpadalq_s16(row_accum21, local_accum01); + row_accum30 = vpadalq_s16(row_accum30, local_accum10); + row_accum31 = vpadalq_s16(row_accum31, local_accum11); + } + + row_accum00 = vaddq_s32(row_accum00, row_accum01); + row_accum10 = vaddq_s32(row_accum10, row_accum11); + row_accum20 = vaddq_s32(row_accum20, row_accum21); + row_accum30 = vaddq_s32(row_accum30, row_accum31); + // Horizontally reduce accumulators + int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, + pairwise_reduced_acc_2, pairwise_reduced_acc_3; + pairwise_reduced_acc_0 = + vpadd_s32(vget_low_s32(row_accum00), vget_high_s32(row_accum00)); + pairwise_reduced_acc_1 = + vpadd_s32(vget_low_s32(row_accum10), vget_high_s32(row_accum10)); + pairwise_reduced_acc_2 = + vpadd_s32(vget_low_s32(row_accum20), vget_high_s32(row_accum20)); + pairwise_reduced_acc_3 = + vpadd_s32(vget_low_s32(row_accum30), vget_high_s32(row_accum30)); + const int32x2_t reduced_lo = + vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); + const int32x2_t reduced_hi = + vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); + int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); + // Add bias values. + reduced = vaddq_s32(reduced, bias_vec); + reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); + // Multiply by the fixed-point multiplier. + reduced = vqrdmulhq_n_s32(reduced, accum_multiplier); + // Rounding-shift-right. + using gemmlowp::RoundingDivideByPOT; + reduced = RoundingDivideByPOT(reduced, right_shift); + // Narrow values down to 16 bit signed. 
+ const int16x4_t res16 = vqmovn_s32(reduced); + vst1_s16(output_ptr, res16); + output_ptr += 4; + } +} +#endif + inline void FullyConnected( const FullyConnectedParams& params, const RuntimeShape& input_shape, const uint8* input_data, const RuntimeShape& filter_shape, @@ -1283,10 +2054,9 @@ inline void FullyConnected( input_data, filter_cols, batches, filter_cols); gemmlowp::MatrixMap<int8, gemmlowp::MapOrder::ColMajor> output_matrix( output_data, output_rows, batches, output_rows); - const auto& output_pipeline = - optimized_integer_ops::GemmlowpOutputPipelineInt8::MakeExp( - bias_data, output_rows, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max); + const auto& output_pipeline = GemmlowpOutputPipelineInt8::MakeExp( + bias_data, output_rows, output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max); gemmlowp::GemmWithOutputPipeline< int8, int8, gemmlowp::SignedL8R8WithLhsNonzeroBitDepthParams>( @@ -2031,6 +2801,28 @@ void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims, filter_offset, input_offset, output_pipeline); } +inline void TransposeConv( + const ConvParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& filter_shape, + const float* filter_data, const RuntimeShape& output_shape, + float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) { + gemmlowp::ScopedProfilingLabel label("TransposeConv"); + // Note we could use transposed weights with forward conv for unstrided + // cases. But we are already getting good performance with this code as-is. + TFLITE_DCHECK(im2col_data); + TransposeIm2col(params, 0, input_shape, input_data, filter_shape, + output_shape, im2col_data); + + const auto im2col_matrix_map = + MapAsMatrixWithLastDimAsRows(im2col_data, im2col_shape); + const auto filter_matrix_map = + MapAsMatrixWithFirstDimAsCols(filter_data, filter_shape); + auto output_matrix_map = + MapAsMatrixWithLastDimAsRows(output_data, output_shape); + + Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map); +} + inline void TransposeConv(const float* input_data, const Dims<4>& input_dims, const float* filter_data, const Dims<4>& filter_dims, int stride_width, int stride_height, int pad_width, diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index df114b39b02..ec4de4d886f 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -27,8 +27,7 @@ limitations under the License. #include <tuple> #include <type_traits> -#include "public/gemmlowp.h" -#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__) #include <Accelerate/Accelerate.h> @@ -37,9 +36,11 @@ limitations under the License. 
#include "third_party/eigen3/Eigen/Core" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "fixedpoint/fixedpoint.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_gemm.h" +#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" #include "tensorflow/lite/kernels/cpu_backend_threadpool.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h" @@ -201,586 +202,11 @@ inline void AddBiasAndEvalActivationFunction(float output_activation_min, const float* bias_data, const RuntimeShape& array_shape, float* array_data) { -#ifdef USE_NEON - gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction"); - const int bias_size = bias_shape.FlatSize(); - const int array_size = array_shape.FlatSize(); - TFLITE_DCHECK_EQ((array_size % bias_size), 0); - float* array_ptr = array_data; - float* array_end_ptr = array_ptr + array_size; - const auto activation_min = vdupq_n_f32(output_activation_min); - const auto activation_max = vdupq_n_f32(output_activation_max); - for (; array_ptr != array_end_ptr; array_ptr += bias_size) { - int i = 0; - for (; i <= bias_size - 16; i += 16) { - auto b0 = vld1q_f32(bias_data + i); - auto b1 = vld1q_f32(bias_data + i + 4); - auto b2 = vld1q_f32(bias_data + i + 8); - auto b3 = vld1q_f32(bias_data + i + 12); - auto a0 = vld1q_f32(array_ptr + i); - auto a1 = vld1q_f32(array_ptr + i + 4); - auto a2 = vld1q_f32(array_ptr + i + 8); - auto a3 = vld1q_f32(array_ptr + i + 12); - auto x0 = vaddq_f32(a0, b0); - auto x1 = vaddq_f32(a1, b1); - auto x2 = vaddq_f32(a2, b2); - auto x3 = vaddq_f32(a3, b3); - x0 = vmaxq_f32(activation_min, x0); - x1 = vmaxq_f32(activation_min, x1); - x2 = vmaxq_f32(activation_min, x2); - x3 = vmaxq_f32(activation_min, x3); - x0 = vminq_f32(activation_max, x0); - x1 = vminq_f32(activation_max, x1); - x2 = vminq_f32(activation_max, x2); - x3 = vminq_f32(activation_max, x3); - vst1q_f32(array_ptr + i, x0); - vst1q_f32(array_ptr + i + 4, x1); - vst1q_f32(array_ptr + i + 8, x2); - vst1q_f32(array_ptr + i + 12, x3); - } - for (; i <= bias_size - 4; i += 4) { - auto b = vld1q_f32(bias_data + i); - auto a = vld1q_f32(array_ptr + i); - auto x = vaddq_f32(a, b); - x = vmaxq_f32(activation_min, x); - x = vminq_f32(activation_max, x); - vst1q_f32(array_ptr + i, x); - } - for (; i < bias_size; i++) { - array_ptr[i] = ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], - output_activation_min, - output_activation_max); - } - } -#else // not NEON - gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction"); - const int bias_size = bias_shape.FlatSize(); - const int array_size = array_shape.FlatSize(); - TFLITE_DCHECK_EQ((array_size % bias_size), 0); - for (int array_offset = 0; array_offset < array_size; - array_offset += bias_size) { - for (int i = 0; i < bias_size; i++) { - array_data[array_offset + i] = ActivationFunctionWithMinMax( - array_data[array_offset + i] + bias_data[i], output_activation_min, - output_activation_max); - } - } -#endif + BiasAndClamp(output_activation_min, output_activation_max, + bias_shape.FlatSize(), bias_data, array_shape.FlatSize(), + array_data); } -template <typename Lhs, typename Rhs, typename Result> -void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs, - Eigen::MatrixBase<Result>* result) { - if (rhs.cols() == 1) { - gemmlowp::ScopedProfilingLabel 
label("GEMV"); - result->col(0).noalias() = lhs * rhs.col(0); - } else { - gemmlowp::ScopedProfilingLabel label("GEMM"); - result->noalias() = lhs * rhs; - } -} - -inline void optimized_ops_preload_l1_stream(const uint8* ptr) { -#ifdef GEMMLOWP_ARM_64 - asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :); -#else - gemmlowp::Prefetch(ptr); -#endif -} - -inline void optimized_ops_preload_l1_keep(const uint8* ptr) { -#ifdef GEMMLOWP_ARM_64 - asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :); -#else - gemmlowp::Prefetch(ptr); -#endif -} - -#ifdef GEMMLOWP_NEON -// In the common case of batch size 1, a fully-connected node degenerates -// to a matrix*vector product. LSTM cells contain a fully-connected node; -// when quantized, this becomes a special type of GEMV operation where -// the output is 16bit-quantized, thus needs its own special path. -inline void GEMVForLstmCell(const RuntimeShape& input_shape, - const uint8* input_data, - const RuntimeShape& weights_shape, - const uint8* weights_data, uint8 weights_zero_point, - const RuntimeShape& bias_shape, - const int32* bias_data, int32 accum_multiplier, - int accum_shift, const RuntimeShape& output_shape, - int16* output_data) { - gemmlowp::ScopedProfilingLabel label("GEMVForLstmCell"); - TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); - TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); - TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); - const int output_dim_count = output_shape.DimensionsCount(); - const int weights_dim_count = weights_shape.DimensionsCount(); - TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1); - const int input_size = FlatSizeSkipDim(input_shape, 0); - const int output_size = MatchingDim(weights_shape, weights_dim_count - 2, - output_shape, output_dim_count - 1); - // This special fast path for quantized LSTM cells does not try to support - // odd sizes that we haven't encountered in any LSTM cell, that would - // require special code (that would go untested until any LSTM cell - // exercises it). We just guard our assumptions about size evenness with - // the following assertions. - TFLITE_DCHECK(!(output_size % 4)); - TFLITE_DCHECK(!(input_size % 8)); - const int32* bias_ptr = bias_data; - int16* output_ptr = output_data; - for (int out = 0; out < output_size; out += 4) { - int32x4_t acc_0 = vdupq_n_s32(0); - int32x4_t acc_1 = vdupq_n_s32(0); - int32x4_t acc_2 = vdupq_n_s32(0); - int32x4_t acc_3 = vdupq_n_s32(0); - const int16x8_t input_offset_vec = vdupq_n_s16(-128); - const int16x8_t weights_offset_vec = vdupq_n_s16(-weights_zero_point); - int in = 0; - // Handle 16 levels of depth at a time. 
- for (; in <= input_size - 16; in += 16) { - const uint8x16_t input_val_u8 = vld1q_u8(input_data + in); - const uint8* weights_ptr = weights_data + in + out * input_size; - uint8x16_t weights_val_u8_0 = vld1q_u8(weights_ptr + 0 * input_size); - uint8x16_t weights_val_u8_1 = vld1q_u8(weights_ptr + 1 * input_size); - uint8x16_t weights_val_u8_2 = vld1q_u8(weights_ptr + 2 * input_size); - uint8x16_t weights_val_u8_3 = vld1q_u8(weights_ptr + 3 * input_size); - int16x8_t input_val_0, input_val_1; - const uint8x8_t low = vget_low_u8(input_val_u8); - const uint8x8_t high = vget_high_u8(input_val_u8); - input_val_0 = vreinterpretq_s16_u16(vmovl_u8(low)); - input_val_1 = vreinterpretq_s16_u16(vmovl_u8(high)); - input_val_0 = vaddq_s16(input_val_0, input_offset_vec); - input_val_1 = vaddq_s16(input_val_1, input_offset_vec); - int16x8_t weights_val_0_0, weights_val_1_0, weights_val_2_0, - weights_val_3_0; - int16x8_t weights_val_0_1, weights_val_1_1, weights_val_2_1, - weights_val_3_1; - weights_val_0_0 = vaddq_s16( - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_0))), - weights_offset_vec); - weights_val_0_1 = vaddq_s16( - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_0))), - weights_offset_vec); - weights_val_1_0 = vaddq_s16( - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_1))), - weights_offset_vec); - weights_val_1_1 = vaddq_s16( - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_1))), - weights_offset_vec); - weights_val_2_0 = vaddq_s16( - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_2))), - weights_offset_vec); - weights_val_2_1 = vaddq_s16( - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_2))), - weights_offset_vec); - weights_val_3_0 = vaddq_s16( - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_3))), - weights_offset_vec); - weights_val_3_1 = vaddq_s16( - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_3))), - weights_offset_vec); - acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_0), - vget_low_s16(input_val_0)); - acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_0), - vget_low_s16(input_val_0)); - acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_0), - vget_low_s16(input_val_0)); - acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_0), - vget_low_s16(input_val_0)); - acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_0), - vget_high_s16(input_val_0)); - acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_0), - vget_high_s16(input_val_0)); - acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_0), - vget_high_s16(input_val_0)); - acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_0), - vget_high_s16(input_val_0)); - acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_1), - vget_low_s16(input_val_1)); - acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_1), - vget_low_s16(input_val_1)); - acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_1), - vget_low_s16(input_val_1)); - acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_1), - vget_low_s16(input_val_1)); - acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_1), - vget_high_s16(input_val_1)); - acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_1), - vget_high_s16(input_val_1)); - acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_1), - vget_high_s16(input_val_1)); - acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_1), - vget_high_s16(input_val_1)); - } - // Handle 8 levels of depth at a time. 
- for (; in < input_size; in += 8) { - const uint8x8_t input_val_u8 = vld1_u8(input_data + in); - const uint8* weights_ptr = weights_data + in + out * input_size; - uint8x8_t weights_val_u8_0 = vld1_u8(weights_ptr + 0 * input_size); - uint8x8_t weights_val_u8_1 = vld1_u8(weights_ptr + 1 * input_size); - uint8x8_t weights_val_u8_2 = vld1_u8(weights_ptr + 2 * input_size); - uint8x8_t weights_val_u8_3 = vld1_u8(weights_ptr + 3 * input_size); - int16x8_t input_val; - input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8)); - input_val = vaddq_s16(input_val, input_offset_vec); - int16x8_t weights_val_0, weights_val_1, weights_val_2, weights_val_3; - weights_val_0 = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_0)), - weights_offset_vec); - weights_val_1 = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_1)), - weights_offset_vec); - weights_val_2 = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_2)), - weights_offset_vec); - weights_val_3 = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_3)), - weights_offset_vec); - acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0), - vget_low_s16(input_val)); - acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1), - vget_low_s16(input_val)); - acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2), - vget_low_s16(input_val)); - acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3), - vget_low_s16(input_val)); - acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0), - vget_high_s16(input_val)); - acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1), - vget_high_s16(input_val)); - acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2), - vget_high_s16(input_val)); - acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3), - vget_high_s16(input_val)); - } - // Horizontally reduce accumulators - int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, - pairwise_reduced_acc_2, pairwise_reduced_acc_3; - pairwise_reduced_acc_0 = - vpadd_s32(vget_low_s32(acc_0), vget_high_s32(acc_0)); - pairwise_reduced_acc_1 = - vpadd_s32(vget_low_s32(acc_1), vget_high_s32(acc_1)); - pairwise_reduced_acc_2 = - vpadd_s32(vget_low_s32(acc_2), vget_high_s32(acc_2)); - pairwise_reduced_acc_3 = - vpadd_s32(vget_low_s32(acc_3), vget_high_s32(acc_3)); - const int32x2_t reduced_lo = - vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); - const int32x2_t reduced_hi = - vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); - int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); - // Add bias values. - int32x4_t bias_vec = vld1q_s32(bias_ptr); - bias_ptr += 4; - reduced = vaddq_s32(reduced, bias_vec); - int left_shift = accum_shift > 0 ? accum_shift : 0; - int right_shift = accum_shift > 0 ? 0 : -accum_shift; - reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); - // Multiply by the fixed-point multiplier. - reduced = vqrdmulhq_n_s32(reduced, accum_multiplier); - // Rounding-shift-right. - using gemmlowp::RoundingDivideByPOT; - reduced = RoundingDivideByPOT(reduced, right_shift); - // Narrow values down to 16 bit signed. 
- const int16x4_t res16 = vqmovn_s32(reduced); - vst1_s16(output_ptr, res16); - output_ptr += 4; - } -} -#endif - -#ifdef GEMMLOWP_NEON -inline void GEMVForLstmCellWithSymmetricRange( - const RuntimeShape& input_shape, const uint8* input_data, - const RuntimeShape& weights_shape, const uint8* weights_data, - const RuntimeShape& bias_shape, const int32* bias_data, - int32 accum_multiplier, int accum_shift, const RuntimeShape& output_shape, - int16* output_data) { - gemmlowp::ScopedProfilingLabel label("GEMVForLstmCellWithSymmetricRange"); - TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); - TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); - TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); - const int output_dim_count = output_shape.DimensionsCount(); - const int weights_dim_count = weights_shape.DimensionsCount(); - TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1); - const int input_size = FlatSizeSkipDim(input_shape, 0); - const int output_size = MatchingDim(weights_shape, weights_dim_count - 2, - output_shape, output_dim_count - 1); - // This special fast path for quantized LSTM cells does not try to support - // odd sizes that we haven't encountered in any LSTM cell, that would - // require special code (that would go untested until any LSTM cell - // exercises it). We just guard our assumptions about size evenness with - // the following assertions. - TFLITE_DCHECK(!(output_size % 4)); - TFLITE_DCHECK(!(input_size % 64)); - const int32* bias_ptr = bias_data; - int16* output_ptr = output_data; - const uint8x16_t signbit = vdupq_n_u8(0x80); - for (int in = 0; in < input_size; in += 32) { - optimized_ops_preload_l1_keep(input_data + in); - } - const int left_shift = accum_shift > 0 ? accum_shift : 0; - const int right_shift = accum_shift > 0 ? 0 : -accum_shift; - for (int out = 0; out < output_size; out += 4) { - // Load the bias values - int32x4_t bias_vec = vld1q_s32(bias_ptr); - bias_ptr += 4; - - // Clear accumulators. We use 2 accumulator registers per row, - // for 4 rows. row_accumRN is the N-th accumulator for row R. - int32x4_t row_accum00 = vdupq_n_s32(0); - int32x4_t row_accum01 = vdupq_n_s32(0); - int32x4_t row_accum10 = vdupq_n_s32(0); - int32x4_t row_accum11 = vdupq_n_s32(0); - int32x4_t row_accum20 = vdupq_n_s32(0); - int32x4_t row_accum21 = vdupq_n_s32(0); - int32x4_t row_accum30 = vdupq_n_s32(0); - int32x4_t row_accum31 = vdupq_n_s32(0); - - // kReadAhead parametrizes how far ahead we prefetch weights into L1 cache. - const int kReadAhead = 512; - // Prefetch the first weights values. - for (int k = 0; k < kReadAhead; k += 64) { - optimized_ops_preload_l1_stream(weights_data + (out + 0) * input_size + - k); - optimized_ops_preload_l1_stream(weights_data + (out + 1) * input_size + - k); - optimized_ops_preload_l1_stream(weights_data + (out + 2) * input_size + - k); - optimized_ops_preload_l1_stream(weights_data + (out + 3) * input_size + - k); - } - // Loop along the rows, handling 64 bytes per iteration because that's - // cache line size on most current ARM-architecture CPUs. - for (int in = 0; in < input_size; in += 64) { - // Prefetch some future weights values. 
- optimized_ops_preload_l1_stream(weights_data + (out + 0) * input_size + - in + kReadAhead); - optimized_ops_preload_l1_stream(weights_data + (out + 1) * input_size + - in + kReadAhead); - optimized_ops_preload_l1_stream(weights_data + (out + 2) * input_size + - in + kReadAhead); - optimized_ops_preload_l1_stream(weights_data + (out + 3) * input_size + - in + kReadAhead); - - // We will use 2 local 16-bit accumulators per row, for 2 rows. - // See below (*) for the rationale of processing only 2 rows at a time. - // local_accumRN is the N-th local accumulator for row R. - int16x8_t local_accum00; - int16x8_t local_accum01; - int16x8_t local_accum10; - int16x8_t local_accum11; - - // Load 64 bytes of input activations values. Convert to signed int8 - // by flipping the sign bit (i.e. subtracting 128, the required - // zero_point value). - int8x16_t input0 = vreinterpretq_s8_u8( - veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 0))); - int8x16_t input1 = vreinterpretq_s8_u8( - veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 1))); - int8x16_t input2 = vreinterpretq_s8_u8( - veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 2))); - int8x16_t input3 = vreinterpretq_s8_u8( - veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 3))); - - // Beginning of the core accumulation. Notice how while we have 4 - // rows to process, this code is taking care of only 2 rows at a time, - // thus being divided into two parts looking similar ("Rows 0 and 1" and - // "Rows 2 and 3"). - // - // (*) The rationale for handling only 2 rows at a time is to avoid - // cache aliasing issues on 4-way set-associative L1-cache CPUs, such - // as Cortex-A53. With sufficiently large, power-of-two matrix dimensions, - // we may find ourselves in a situation where rows alias each other in - // the L1 cache, and moreover may also mutually alias with the input - // activations. If we try to load 4 rows at a time, together with the - // input activations, that may be 5 mutually-aliasing vectors, resulting - // in constant mutual eviction from L1 cache. Handling 2 rows at a time - // here largely mitigates these issues, and seems at least to be very - // effective on Cortex-A53: - // Before After - // big (Cortex-A73) 2.85 ms 2.85 ms - // little (Cortex-A53) 11.0 ms 5.16 ms - - // Rows 0 and 1: - // Load 64 bytes of weights values from each row. Convert to signed int8 - // by flipping the sign bit (i.e. subtracting 128, the required - // zero_point value). 
- int8x16_t weights00 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 0))); - int8x16_t weights01 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 1))); - int8x16_t weights02 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 2))); - int8x16_t weights03 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 3))); - int8x16_t weights10 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 0))); - int8x16_t weights11 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 1))); - int8x16_t weights12 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 2))); - int8x16_t weights13 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 3))); - // Multiply-accumulate into local 16-bit accumulators. - // We can accumulate two products without overflow because weights are - // required to never be -128, so each product is at most 127^2 in absolute - // value. - local_accum00 = vmull_s8(vget_low_s8(weights00), vget_low_s8(input0)); - local_accum01 = vmull_s8(vget_low_s8(weights01), vget_low_s8(input1)); - local_accum10 = vmull_s8(vget_low_s8(weights10), vget_low_s8(input0)); - local_accum11 = vmull_s8(vget_low_s8(weights11), vget_low_s8(input1)); - local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights00), - vget_high_s8(input0)); - local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights01), - vget_high_s8(input1)); - local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights10), - vget_high_s8(input0)); - local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights11), - vget_high_s8(input1)); - // Pairwise add and accumulate into 32-bit accumulators - row_accum00 = vpadalq_s16(row_accum00, local_accum00); - row_accum01 = vpadalq_s16(row_accum01, local_accum01); - row_accum10 = vpadalq_s16(row_accum10, local_accum10); - row_accum11 = vpadalq_s16(row_accum11, local_accum11); - // Multiply-accumulate into local 16-bit accumulators. - // We can accumulate two products without overflow because weights are - // required to never be -128, so each product is at most 127^2 in absolute - // value. - local_accum00 = vmull_s8(vget_low_s8(weights02), vget_low_s8(input2)); - local_accum01 = vmull_s8(vget_low_s8(weights03), vget_low_s8(input3)); - local_accum10 = vmull_s8(vget_low_s8(weights12), vget_low_s8(input2)); - local_accum11 = vmull_s8(vget_low_s8(weights13), vget_low_s8(input3)); - local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights02), - vget_high_s8(input2)); - local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights03), - vget_high_s8(input3)); - local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights12), - vget_high_s8(input2)); - local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights13), - vget_high_s8(input3)); - // Pairwise add and accumulate into 32-bit accumulators - row_accum00 = vpadalq_s16(row_accum00, local_accum00); - row_accum01 = vpadalq_s16(row_accum01, local_accum01); - row_accum10 = vpadalq_s16(row_accum10, local_accum10); - row_accum11 = vpadalq_s16(row_accum11, local_accum11); - - // Rows 2 and 3: - // Load 64 bytes of weights values from each row. Convert to signed int8 - // by flipping the sign bit (i.e. 
subtracting 128, the required - // zero_point value). - weights00 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 0))); - weights01 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 1))); - weights02 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 2))); - weights03 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 3))); - weights10 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 0))); - weights11 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 1))); - weights12 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 2))); - weights13 = vreinterpretq_s8_u8(veorq_u8( - signbit, - vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 3))); - // Multiply-accumulate into local 16-bit accumulators. - // We can accumulate two products without overflow because weights are - // required to never be -128, so each product is at most 127^2 in absolute - // value. - local_accum00 = vmull_s8(vget_low_s8(weights00), vget_low_s8(input0)); - local_accum01 = vmull_s8(vget_low_s8(weights01), vget_low_s8(input1)); - local_accum10 = vmull_s8(vget_low_s8(weights10), vget_low_s8(input0)); - local_accum11 = vmull_s8(vget_low_s8(weights11), vget_low_s8(input1)); - local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights00), - vget_high_s8(input0)); - local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights01), - vget_high_s8(input1)); - local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights10), - vget_high_s8(input0)); - local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights11), - vget_high_s8(input1)); - // Pairwise add and accumulate into 32-bit accumulators - row_accum20 = vpadalq_s16(row_accum20, local_accum00); - row_accum21 = vpadalq_s16(row_accum21, local_accum01); - row_accum30 = vpadalq_s16(row_accum30, local_accum10); - row_accum31 = vpadalq_s16(row_accum31, local_accum11); - // Multiply-accumulate into local 16-bit accumulators. - // We can accumulate two products without overflow because weights are - // required to never be -128, so each product is at most 127^2 in absolute - // value. 
- local_accum00 = vmull_s8(vget_low_s8(weights02), vget_low_s8(input2)); - local_accum01 = vmull_s8(vget_low_s8(weights03), vget_low_s8(input3)); - local_accum10 = vmull_s8(vget_low_s8(weights12), vget_low_s8(input2)); - local_accum11 = vmull_s8(vget_low_s8(weights13), vget_low_s8(input3)); - local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights02), - vget_high_s8(input2)); - local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights03), - vget_high_s8(input3)); - local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights12), - vget_high_s8(input2)); - local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights13), - vget_high_s8(input3)); - // Pairwise add and accumulate into 32-bit accumulators - row_accum20 = vpadalq_s16(row_accum20, local_accum00); - row_accum21 = vpadalq_s16(row_accum21, local_accum01); - row_accum30 = vpadalq_s16(row_accum30, local_accum10); - row_accum31 = vpadalq_s16(row_accum31, local_accum11); - } - - row_accum00 = vaddq_s32(row_accum00, row_accum01); - row_accum10 = vaddq_s32(row_accum10, row_accum11); - row_accum20 = vaddq_s32(row_accum20, row_accum21); - row_accum30 = vaddq_s32(row_accum30, row_accum31); - // Horizontally reduce accumulators - int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, - pairwise_reduced_acc_2, pairwise_reduced_acc_3; - pairwise_reduced_acc_0 = - vpadd_s32(vget_low_s32(row_accum00), vget_high_s32(row_accum00)); - pairwise_reduced_acc_1 = - vpadd_s32(vget_low_s32(row_accum10), vget_high_s32(row_accum10)); - pairwise_reduced_acc_2 = - vpadd_s32(vget_low_s32(row_accum20), vget_high_s32(row_accum20)); - pairwise_reduced_acc_3 = - vpadd_s32(vget_low_s32(row_accum30), vget_high_s32(row_accum30)); - const int32x2_t reduced_lo = - vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); - const int32x2_t reduced_hi = - vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); - int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); - // Add bias values. - reduced = vaddq_s32(reduced, bias_vec); - reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); - // Multiply by the fixed-point multiplier. - reduced = vqrdmulhq_n_s32(reduced, accum_multiplier); - // Rounding-shift-right. - using gemmlowp::RoundingDivideByPOT; - reduced = RoundingDivideByPOT(reduced, right_shift); - // Narrow values down to 16 bit signed. - const int16x4_t res16 = vqmovn_s32(reduced); - vst1_s16(output_ptr, res16); - output_ptr += 4; - } -} -#endif - inline void FullyConnected( const FullyConnectedParams& params, const RuntimeShape& input_shape, const float* input_data, const RuntimeShape& weights_shape, @@ -788,395 +214,31 @@ inline void FullyConnected( const float* optional_bias_data, const RuntimeShape& output_shape, float* output_data, CpuBackendContext* cpu_backend_context) { gemmlowp::ScopedProfilingLabel label("FullyConnected"); - const float output_activation_min = params.float_activation_min; - const float output_activation_max = params.float_activation_max; - - // TODO(b/62193649): this convoluted shape computation (determining - // input_rows from the weights_dims, then MapAsMatrixWithGivenNumberOfRows) - // is because the current --variable_batch hack consists in overwriting the - // 3rd dimension with the runtime batch size, as we don't keep track for each - // array of which dimension is the batch dimension in it. 
- // When that is fixed, this should become: - // const auto input_matrix_map = - // MapAsMatrixWithFirstDimAsRows(input_data, input_dims); const int dims_count = weights_shape.DimensionsCount(); const int input_rows = weights_shape.Dims(dims_count - 1); - const auto input_matrix_map = - MapAsMatrixWithGivenNumberOfRows(input_data, input_shape, input_rows); - const auto filter_matrix_map = - MapAsMatrixWithLastDimAsRows(weights_data, weights_shape); - auto output_matrix_map = - MapAsMatrixWithLastDimAsRows(output_data, output_shape); - - Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map); - - if (optional_bias_data != nullptr) { - AddBiasAndEvalActivationFunction( - output_activation_min, output_activation_max, bias_shape, - optional_bias_data, output_shape, output_data); - } else { - const int flat_size = output_shape.FlatSize(); - for (int i = 0; i < flat_size; ++i) { - output_data[i] = ActivationFunctionWithMinMax( - output_data[i], output_activation_min, output_activation_max); - } - } + cpu_backend_gemm::MatrixParams<float> rhs_params; + rhs_params.order = cpu_backend_gemm::Order::kColMajor; + rhs_params.rows = input_rows; + rhs_params.cols = input_shape.FlatSize() / input_rows; + TFLITE_DCHECK_EQ(input_shape.FlatSize(), rhs_params.rows * rhs_params.cols); + cpu_backend_gemm::MatrixParams<float> lhs_params; + lhs_params.order = cpu_backend_gemm::Order::kRowMajor; + lhs_params.cols = weights_shape.Dims(dims_count - 1); + lhs_params.rows = FlatSizeSkipDim(weights_shape, dims_count - 1); + cpu_backend_gemm::MatrixParams<float> dst_params; + dst_params.order = cpu_backend_gemm::Order::kColMajor; + dst_params.rows = output_shape.Dims(output_shape.DimensionsCount() - 1); + dst_params.cols = + FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1); + cpu_backend_gemm::GemmParams<float, float> gemm_params; + gemm_params.bias = optional_bias_data; + gemm_params.clamp_min = params.float_activation_min; + gemm_params.clamp_max = params.float_activation_max; + cpu_backend_gemm::Gemm(lhs_params, weights_data, rhs_params, input_data, + dst_params, output_data, gemm_params, + cpu_backend_context); } -#ifdef USE_NEON -inline void FullyConnectedAsGEMVWorkerImpl( - const RuntimeShape& input_shape, const uint8* input_data, - int32 input_offset, const RuntimeShape& filter_shape, - const uint8* filter_data, int32 filter_offset, - const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset, - int32 output_multiplier, int output_shift, int32 output_activation_min, - int32 output_activation_max, const RuntimeShape& output_shape, - uint8* output_data, int row_start, int row_end) { - gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit"); - TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); - TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); - TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); - const int output_dim_count = output_shape.DimensionsCount(); - TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1); - const int input_size = FlatSizeSkipDim(input_shape, 0); - static constexpr int kPeel = 4; - const bool shift_left = (output_shift > 0); - for (int k = 0; k < input_size; k += 64) { - optimized_ops_preload_l1_stream(input_data + k); - } - for (int k = 0; k < kPeel * input_size; k += 64) { - optimized_ops_preload_l1_stream(filter_data + k); - } - - TFLITE_DCHECK_GE(row_end - row_start, kPeel); - - for (int out = row_start; out < row_end; out += kPeel) { - out = std::min(out, row_end - kPeel); - int32x4_t acc0 = 
vdupq_n_s32(0); - int32x4_t acc1 = acc0; - int32x4_t acc2 = acc0; - int32x4_t acc3 = acc0; - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset); - int in = 0; - for (; in <= input_size - 16; in += 16) { - const uint8x16_t input_val_u8 = vld1q_u8(input_data + in); - const uint8* filter_ptr = filter_data + in + out * input_size; - uint8x16_t filter_val_u8_0 = vld1q_u8(filter_ptr); - optimized_ops_preload_l1_stream(filter_ptr + 64); - filter_ptr += input_size; - uint8x16_t filter_val_u8_1 = vld1q_u8(filter_ptr); - optimized_ops_preload_l1_stream(filter_ptr + 64); - filter_ptr += input_size; - uint8x16_t filter_val_u8_2 = vld1q_u8(filter_ptr); - optimized_ops_preload_l1_stream(filter_ptr + 64); - filter_ptr += input_size; - uint8x16_t filter_val_u8_3 = vld1q_u8(filter_ptr); - optimized_ops_preload_l1_stream(filter_ptr + 64); - int16x8_t input_val_0, input_val_1; - uint8x8_t low = vget_low_u8(input_val_u8); - uint8x8_t high = vget_high_u8(input_val_u8); - input_val_0 = vreinterpretq_s16_u16(vmovl_u8(low)); - input_val_1 = vreinterpretq_s16_u16(vmovl_u8(high)); - input_val_0 = vaddq_s16(input_val_0, input_offset_vec); - input_val_1 = vaddq_s16(input_val_1, input_offset_vec); - low = vget_low_u8(filter_val_u8_0); - high = vget_high_u8(filter_val_u8_0); - int16x8_t filter_val_0_0 = vreinterpretq_s16_u16(vmovl_u8(low)); - int16x8_t filter_val_0_1 = vreinterpretq_s16_u16(vmovl_u8(high)); - filter_val_0_0 = vaddq_s16(filter_val_0_0, filter_offset_vec); - filter_val_0_1 = vaddq_s16(filter_val_0_1, filter_offset_vec); - low = vget_low_u8(filter_val_u8_1); - high = vget_high_u8(filter_val_u8_1); - int16x8_t filter_val_1_0 = vreinterpretq_s16_u16(vmovl_u8(low)); - int16x8_t filter_val_1_1 = vreinterpretq_s16_u16(vmovl_u8(high)); - filter_val_1_0 = vaddq_s16(filter_val_1_0, filter_offset_vec); - filter_val_1_1 = vaddq_s16(filter_val_1_1, filter_offset_vec); - low = vget_low_u8(filter_val_u8_2); - high = vget_high_u8(filter_val_u8_2); - int16x8_t filter_val_2_0 = vreinterpretq_s16_u16(vmovl_u8(low)); - int16x8_t filter_val_2_1 = vreinterpretq_s16_u16(vmovl_u8(high)); - filter_val_2_0 = vaddq_s16(filter_val_2_0, filter_offset_vec); - filter_val_2_1 = vaddq_s16(filter_val_2_1, filter_offset_vec); - low = vget_low_u8(filter_val_u8_3); - high = vget_high_u8(filter_val_u8_3); - int16x8_t filter_val_3_0 = vreinterpretq_s16_u16(vmovl_u8(low)); - int16x8_t filter_val_3_1 = vreinterpretq_s16_u16(vmovl_u8(high)); - filter_val_3_0 = vaddq_s16(filter_val_3_0, filter_offset_vec); - filter_val_3_1 = vaddq_s16(filter_val_3_1, filter_offset_vec); - acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_0), - vget_low_s16(input_val_0)); - acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_0), - vget_low_s16(input_val_0)); - acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_0), - vget_low_s16(input_val_0)); - acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_0), - vget_low_s16(input_val_0)); - acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_1), - vget_low_s16(input_val_1)); - acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_1), - vget_low_s16(input_val_1)); - acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_1), - vget_low_s16(input_val_1)); - acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_1), - vget_low_s16(input_val_1)); - acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_0), - vget_high_s16(input_val_0)); - acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_0), - vget_high_s16(input_val_0)); - acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_0), - 
vget_high_s16(input_val_0)); - acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_0), - vget_high_s16(input_val_0)); - acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_1), - vget_high_s16(input_val_1)); - acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_1), - vget_high_s16(input_val_1)); - acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_1), - vget_high_s16(input_val_1)); - acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_1), - vget_high_s16(input_val_1)); - } - for (; in <= input_size - 8; in += 8) { - const uint8x8_t input_val_u8 = vld1_u8(input_data + in); - const uint8* filter_ptr = filter_data + in + out * input_size; - uint8x8_t filter_val_u8_0 = vld1_u8(filter_ptr); - filter_ptr += input_size; - uint8x8_t filter_val_u8_1 = vld1_u8(filter_ptr); - filter_ptr += input_size; - uint8x8_t filter_val_u8_2 = vld1_u8(filter_ptr); - filter_ptr += input_size; - uint8x8_t filter_val_u8_3 = vld1_u8(filter_ptr); - int16x8_t input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8)); - input_val = vaddq_s16(input_val, input_offset_vec); - int16x8_t filter_val_0 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_0)); - filter_val_0 = vaddq_s16(filter_val_0, filter_offset_vec); - int16x8_t filter_val_1 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_1)); - filter_val_1 = vaddq_s16(filter_val_1, filter_offset_vec); - int16x8_t filter_val_2 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_2)); - filter_val_2 = vaddq_s16(filter_val_2, filter_offset_vec); - int16x8_t filter_val_3 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_3)); - filter_val_3 = vaddq_s16(filter_val_3, filter_offset_vec); - acc0 = - vmlal_s16(acc0, vget_low_s16(filter_val_0), vget_low_s16(input_val)); - acc1 = - vmlal_s16(acc1, vget_low_s16(filter_val_1), vget_low_s16(input_val)); - acc2 = - vmlal_s16(acc2, vget_low_s16(filter_val_2), vget_low_s16(input_val)); - acc3 = - vmlal_s16(acc3, vget_low_s16(filter_val_3), vget_low_s16(input_val)); - acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0), - vget_high_s16(input_val)); - acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1), - vget_high_s16(input_val)); - acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2), - vget_high_s16(input_val)); - acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3), - vget_high_s16(input_val)); - } - if (in < input_size) { - int32 buf[16]; - vst1q_s32(buf + 0, acc0); - vst1q_s32(buf + 4, acc1); - vst1q_s32(buf + 8, acc2); - vst1q_s32(buf + 12, acc3); - for (; in < input_size; in++) { - int lane = (in + 8 - input_size) % 4; - const int32 input_val = input_data[in] + input_offset; - for (int k = 0; k < kPeel; k++) { - int32 filter_val = - filter_data[in + (out + k) * input_size] + filter_offset; - buf[lane + 4 * k] += filter_val * input_val; - } - } - acc0 = vld1q_s32(buf + 0); - acc1 = vld1q_s32(buf + 4); - acc2 = vld1q_s32(buf + 8); - acc3 = vld1q_s32(buf + 12); - } - - // Horizontally reduce accumulators - int32x2_t pairwise_reduced_acc_0 = - vpadd_s32(vget_low_s32(acc0), vget_high_s32(acc0)); - int32x2_t pairwise_reduced_acc_1 = - vpadd_s32(vget_low_s32(acc1), vget_high_s32(acc1)); - int32x2_t pairwise_reduced_acc_2 = - vpadd_s32(vget_low_s32(acc2), vget_high_s32(acc2)); - int32x2_t pairwise_reduced_acc_3 = - vpadd_s32(vget_low_s32(acc3), vget_high_s32(acc3)); - const int32x2_t reduced_lo = - vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); - const int32x2_t reduced_hi = - vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); - int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); - // Add bias values. 
- int32x4_t bias_vec = vld1q_s32(bias_data + out); - reduced = vaddq_s32(reduced, bias_vec); - if (shift_left) { - const int32 multiplier_power_of_two = 1 << output_shift; - reduced = vmulq_n_s32(reduced, multiplier_power_of_two); - reduced = vqrdmulhq_n_s32(reduced, output_multiplier); - } else { - // Multiply by the fixed-point multiplier. - reduced = vqrdmulhq_n_s32(reduced, output_multiplier); - // Rounding-shift-right. - using gemmlowp::RoundingDivideByPOT; - reduced = RoundingDivideByPOT(reduced, -output_shift); - } - // Add the output offset. - const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); - reduced = vaddq_s32(reduced, output_offset_vec); - // Narrow values down to 16 bit signed. - const int16x4_t res16 = vqmovn_s32(reduced); - // Narrow values down to 8 bit unsigned, saturating. - uint8x8_t res8 = vqmovun_s16(vcombine_s16(res16, res16)); - // Apply the clamping from the activation function - res8 = vmax_u8(res8, vdup_n_u8(output_activation_min)); - res8 = vmin_u8(res8, vdup_n_u8(output_activation_max)); - // Store results to destination. - vst1_lane_u8(output_data + out + 0, res8, 0); - vst1_lane_u8(output_data + out + 1, res8, 1); - vst1_lane_u8(output_data + out + 2, res8, 2); - vst1_lane_u8(output_data + out + 3, res8, 3); - } -} - -struct FullyConnectedAsGEMVWorkerTask : cpu_backend_threadpool::Task { - FullyConnectedAsGEMVWorkerTask(const RuntimeShape& input_shape, - const uint8* input_data, int32 input_offset, - const RuntimeShape& filter_shape, - const uint8* filter_data, int32 filter_offset, - const RuntimeShape& bias_shape, - const int32* bias_data, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, - const RuntimeShape& output_shape, - uint8* output_data, int row_start, int row_end) - : input_shape_(input_shape), - input_data_(input_data), - input_offset_(input_offset), - filter_shape_(filter_shape), - filter_data_(filter_data), - filter_offset_(filter_offset), - bias_shape_(bias_shape), - bias_data_(bias_data), - output_offset_(output_offset), - output_multiplier_(output_multiplier), - output_shift_(output_shift), - output_activation_min_(output_activation_min), - output_activation_max_(output_activation_max), - output_shape_(output_shape), - output_data_(output_data), - row_start_(row_start), - row_end_(row_end) {} - - void Run() override { - FullyConnectedAsGEMVWorkerImpl( - input_shape_, input_data_, input_offset_, filter_shape_, filter_data_, - filter_offset_, bias_shape_, bias_data_, output_offset_, - output_multiplier_, output_shift_, output_activation_min_, - output_activation_max_, output_shape_, output_data_, row_start_, - row_end_); - } - - const RuntimeShape& input_shape_; - const uint8* input_data_; - int32 input_offset_; - const RuntimeShape& filter_shape_; - const uint8* filter_data_; - int32 filter_offset_; - const RuntimeShape& bias_shape_; - const int32* bias_data_; - int32 output_offset_; - int32 output_multiplier_; - int output_shift_; - int32 output_activation_min_; - int32 output_activation_max_; - const RuntimeShape& output_shape_; - uint8* output_data_; - int row_start_; - int row_end_; -}; - -inline void FullyConnectedAsGEMV( - const RuntimeShape& input_shape, const uint8* input_data, - int32 input_offset, const RuntimeShape& filter_shape, - const uint8* filter_data, int32 filter_offset, - const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset, - int32 output_multiplier, int output_shift, int32 output_activation_min, - int32 
output_activation_max, const RuntimeShape& output_shape, - uint8* output_data, CpuBackendContext* cpu_backend_context) { - const int output_dim_count = output_shape.DimensionsCount(); - const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); - const int output_rows = output_shape.Dims(output_dim_count - 1); - const int input_size = FlatSizeSkipDim(input_shape, 0); - static constexpr int kKernelRows = 4; - const int thread_count = gemmlowp::HowManyThreads<kKernelRows>( - cpu_backend_context->max_num_threads(), output_rows, batches, input_size); - if (thread_count == 1) { - // Single-thread case: do the computation on the current thread, don't - // use a threadpool - FullyConnectedAsGEMVWorkerImpl( - input_shape, input_data, input_offset, filter_shape, filter_data, - filter_offset, bias_shape, bias_data, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_shape, output_data, 0, output_rows); - return; - } - - // Multi-threaded case: use the gemmlowp context's threadpool. - TFLITE_DCHECK_GT(thread_count, 1); - std::vector<FullyConnectedAsGEMVWorkerTask> tasks; - // TODO(b/131746020) don't create new heap allocations every time. - // At least we make it a single heap allocation by using reserve(). - tasks.reserve(thread_count); - const int kRowsPerWorker = gemmlowp::RoundUp<kKernelRows>( - gemmlowp::CeilQuotient(output_rows, thread_count)); - int row_start = 0; - for (int i = 0; i < thread_count; ++i) { - int row_end = std::min(output_rows, row_start + kRowsPerWorker); - tasks.emplace_back(input_shape, input_data, input_offset, filter_shape, - filter_data, filter_offset, bias_shape, bias_data, - output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, - output_shape, output_data, row_start, row_end); - row_start = row_end; - } - TFLITE_DCHECK_EQ(row_start, output_rows); - cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), - cpu_backend_context); -} -#endif // USE_NEON - -struct GemmlowpOutputPipeline { - typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col> - ColVectorMap; - typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>, - gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, - gemmlowp::OutputStageClamp, - gemmlowp::OutputStageSaturatingCastToUint8> - Pipeline; - static Pipeline MakeExp(const int32* bias_data, int output_rows, - int32 output_offset, int32 output_multiplier, - int output_left_shift, int32 output_activation_min, - int32 output_activation_max) { - ColVectorMap bias_vector(bias_data, output_rows); - gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage; - bias_addition_stage.bias_vector = bias_vector; - gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage; - quantize_down_stage.result_offset_after_shift = output_offset; - quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; - quantize_down_stage.result_exponent = output_left_shift; - gemmlowp::OutputStageClamp clamp_stage; - clamp_stage.min = output_activation_min; - clamp_stage.max = output_activation_max; - gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; - return std::make_tuple(bias_addition_stage, quantize_down_stage, - clamp_stage, saturating_cast_stage); - } -}; - inline void FullyConnected( const FullyConnectedParams& params, const RuntimeShape& input_shape, const uint8* input_data, const RuntimeShape& filter_shape, @@ -1201,20 +263,6 @@ inline void FullyConnected( const int output_dim_count = 
output_shape.DimensionsCount(); const int filter_dim_count = filter_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); -#ifdef USE_NEON - if (batches == 1) { - const int output_size = MatchingDim(filter_shape, filter_dim_count - 2, - output_shape, output_dim_count - 1); - if (output_size >= 4) { - return FullyConnectedAsGEMV( - input_shape, input_data, input_offset, filter_shape, filter_data, - filter_offset, bias_shape, bias_data, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_shape, output_data, - cpu_backend_context); - } - } -#endif // USE_NEON const int filter_rows = filter_shape.Dims(filter_dim_count - 2); const int filter_cols = filter_shape.Dims(filter_dim_count - 1); TFLITE_DCHECK_EQ(filter_shape.FlatSize(), filter_rows * filter_cols); @@ -1279,31 +327,6 @@ inline void FullyConnected( output_shape, output_dim_count - 1); const int accum_depth = filter_shape.Dims(filter_dim_count - 1); - // Implementation of the fully connected node suited to the inside of an LSTM - // cell. The operands are 8-bit integers, the accumulators are internally - // 32bit integers, and the output is 16-bit fixed-point with 3 integer bits so - // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that - // is explained in the function comment above. -#ifdef GEMMLOWP_NEON - if (batches == 1 && input_offset == -128 && output_activation_min == -32768 && - output_activation_max == 32767) { - if (filter_offset == -128 && !(output_depth % 4) && !(accum_depth % 64)) { - GEMVForLstmCellWithSymmetricRange( - input_shape, input_data, filter_shape, filter_data, bias_shape, - bias_data_int32, output_multiplier, output_shift, output_shape, - output_data); - return; - } - if (!(output_depth % 4) && !(accum_depth % 8)) { - GEMVForLstmCell(input_shape, input_data, filter_shape, filter_data, - filter_offset, bias_shape, bias_data_int32, - output_multiplier, output_shift, output_shape, - output_data); - return; - } - } -#endif - cpu_backend_gemm::MatrixParams<uint8> lhs_params; lhs_params.rows = output_depth; lhs_params.cols = accum_depth; @@ -1742,9 +765,9 @@ inline void ShuffledFullyConnected( } static constexpr int kKernelRows = 4; - const int thread_count = gemmlowp::HowManyThreads<kKernelRows>( - cpu_backend_context->max_num_threads(), output_depth, batches, - accum_depth); + const int thread_count = + LegacyHowManyThreads<kKernelRows>(cpu_backend_context->max_num_threads(), + output_depth, batches, accum_depth); if (thread_count == 1) { // Single-thread case: do the computation on the current thread, don't // use a threadpool @@ -1761,8 +784,8 @@ inline void ShuffledFullyConnected( // TODO(b/131746020) don't create new heap allocations every time. // At least we make it a single heap allocation by using reserve(). 
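For reference, the per-thread work split used by the worker-task loop that follows (and by the GEMV path removed above) is a ceiling division of the output rows, rounded up to the 4-row kernel granularity so every task starts on a kernel boundary. A minimal standalone sketch of that arithmetic, assuming kKernelRows = 4; the helpers here are simplified stand-ins for illustration, not the TFLite CeilQuotient/RoundUp/LegacyHowManyThreads internals:

#include <algorithm>
#include <cstdio>

// Simplified stand-ins for the CeilQuotient / RoundUp<N> helpers.
int CeilQuotient(int a, int b) { return (a + b - 1) / b; }
int RoundUpTo(int value, int multiple) {
  return CeilQuotient(value, multiple) * multiple;
}

int main() {
  const int kKernelRows = 4;     // each worker handles multiples of 4 rows
  const int output_rows = 1001;  // e.g. output_depth of the fully-connected op
  const int thread_count = 3;    // whatever the thread-count heuristic decided

  // Rows per worker, rounded up to the kernel granularity.
  const int rows_per_worker =
      RoundUpTo(CeilQuotient(output_rows, thread_count), kKernelRows);

  // Same row_start/row_end carving as the task-creation loop in the diff;
  // the last task simply picks up the (possibly smaller) remainder.
  int row_start = 0;
  for (int i = 0; i < thread_count; ++i) {
    const int row_end = std::min(output_rows, row_start + rows_per_worker);
    std::printf("task %d: rows [%d, %d)\n", i, row_start, row_end);
    row_start = row_end;
  }
  return 0;  // prints [0, 336), [336, 672), [672, 1001)
}

Rounding to kKernelRows keeps every slice aligned with the 4-row kernel at the cost of a slightly uneven last slice, which is why row_start always lands exactly on the final row when the loop ends.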
tasks.reserve(thread_count); - const int kRowsPerWorker = gemmlowp::RoundUp<kKernelRows>( - gemmlowp::CeilQuotient(output_depth, thread_count)); + const int kRowsPerWorker = + RoundUp<kKernelRows>(CeilQuotient(output_depth, thread_count)); int row_start = 0; for (int i = 0; i < thread_count; i++) { int row_end = std::min(output_depth, row_start + kRowsPerWorker); @@ -2045,6 +1068,12 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, gemm_input_shape = &input_shape; } + const int gemm_input_dims = gemm_input_shape->DimensionsCount(); + int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1); + int n = output_shape.Dims(3); + int k = gemm_input_shape->Dims(gemm_input_dims - 1); + +#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__) // The following code computes matrix multiplication c = a * transponse(b) // with CBLAS, where: // * `a` is a matrix with dimensions (m, k). @@ -2054,12 +1083,6 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, const float* a = gemm_input_data; const float* b = filter_data; float* c = output_data; - const int gemm_input_dims = gemm_input_shape->DimensionsCount(); - int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1); - int n = output_shape.Dims(3); - int k = gemm_input_shape->Dims(gemm_input_dims - 1); - -#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__) // The stride of matrix a, b and c respectively. int stride_a = k; int stride_b = k; @@ -2067,36 +1090,32 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a, stride_a, b, stride_b, 0.0f, c, stride_c); -#else - // When an optimized CBLAS implementation is not available, fall back - // to using Eigen. - typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> - Matrix; - typedef Eigen::Map<Matrix> MatrixRef; - typedef Eigen::Map<const Matrix> ConstMatrixRef; - - MatrixRef matrix_c(c, m, n); - ConstMatrixRef matrix_a(a, m, k); - ConstMatrixRef matrix_b(b, n, k); - - // The following special casing for when a or b is a vector is required - // as Eigen seem to fail to make this optimization on its own. - if (n == 1) { - gemmlowp::ScopedProfilingLabel label("GEMV"); - matrix_c.col(0).noalias() = matrix_a * matrix_b.row(0).transpose(); - } else if (m == 1) { - gemmlowp::ScopedProfilingLabel label("GEMV"); - matrix_c.row(0).noalias() = matrix_a.row(0) * matrix_b.transpose(); - } else { - gemmlowp::ScopedProfilingLabel label("GEMM"); - matrix_c.noalias() = matrix_a * matrix_b.transpose(); - } - -#endif // defined(TF_LITE_USE_CBLAS) && defined(__APPLE__) - optimized_ops::AddBiasAndEvalActivationFunction( output_activation_min, output_activation_max, bias_shape, bias_data, output_shape, output_data); +#else + // When an optimized CBLAS implementation is not available, fall back + // to using cpu_backend_gemm. 
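To make the (m, n, k) mapping above concrete: both branches of this #if compute output[pixel][channel] = dot(im2col_row, filter_row) + bias, followed by the activation clamp. A plain reference loop for that product, with made-up names and as a sketch only, not the CBLAS or cpu_backend_gemm implementation:

#include <algorithm>
#include <cstddef>
#include <vector>

// c = a * transpose(b), i.e. c[i][j] = sum_d a[i][d] * b[j][d], where
//   a: im2col'd input, m rows (batches x output pixels) by k cols (patch size)
//   b: filters, n rows (output channels) by k cols
//   c: output, m by n
void ReferenceConvGemm(const std::vector<float>& a, const std::vector<float>& b,
                       const std::vector<float>& bias, int m, int n, int k,
                       float clamp_min, float clamp_max, std::vector<float>* c) {
  c->assign(static_cast<std::size_t>(m) * n, 0.0f);
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = bias[j];
      for (int d = 0; d < k; ++d) {
        acc += a[i * k + d] * b[j * k + d];
      }
      // Fused bias + activation clamp, like GemmParams::clamp_min/clamp_max.
      (*c)[i * n + j] = std::min(std::max(acc, clamp_min), clamp_max);
    }
  }
}

The cpu_backend_gemm call below expresses the same product without physically transposing anything: the filter is declared as a row-major n x k matrix, the im2col'd input as a column-major k x m matrix (the same memory layout as row-major m x k), and the destination as column-major n x m, so each output pixel's n channels stay contiguous.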
+ cpu_backend_gemm::MatrixParams<float> lhs_params; + lhs_params.order = cpu_backend_gemm::Order::kRowMajor; + lhs_params.rows = n; + lhs_params.cols = k; + cpu_backend_gemm::MatrixParams<float> rhs_params; + rhs_params.order = cpu_backend_gemm::Order::kColMajor; + rhs_params.rows = k; + rhs_params.cols = m; + cpu_backend_gemm::MatrixParams<float> dst_params; + dst_params.order = cpu_backend_gemm::Order::kColMajor; + dst_params.rows = n; + dst_params.cols = m; + cpu_backend_gemm::GemmParams<float, float> gemm_params; + gemm_params.bias = bias_data; + gemm_params.clamp_min = output_activation_min; + gemm_params.clamp_max = output_activation_max; + cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, gemm_input_data, + dst_params, output_data, gemm_params, + cpu_backend_context); +#endif // defined(TF_LITE_USE_CBLAS) && defined(__APPLE__) } inline void HybridConv(const ConvParams& params, float* scaling_factors_ptr, @@ -2256,20 +1275,6 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows); TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows); -#ifdef USE_NEON - if (gemm_input_cols == 1 && output_rows >= 4) { - RuntimeShape fc_filter_shape{ - filter_shape.Dims(0), - filter_shape.Dims(filter_shape.DimensionsCount() - 1)}; - - return FullyConnectedAsGEMV( - *gemm_input_shape, gemm_input_data, input_offset, fc_filter_shape, - filter_data, filter_offset, bias_shape, bias_data, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_shape, output_data, cpu_backend_context); - } -#endif - cpu_backend_gemm::MatrixParams<uint8> lhs_params; lhs_params.rows = filter_rows; lhs_params.cols = filter_cols; @@ -3592,40 +2597,28 @@ inline void LstmCell( // integers, and the output is 16-bit fixed-point with 3 integer bits so // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that // is explained in the function comment above. 
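A quick worked example of the 16-bit fixed-point format referred to above: with 3 integer bits there are 12 fractional bits, so a raw int16 value v represents v / 4096, which is exactly where the [-2^3, 2^3] range comes from. Illustration only; the helper below is not part of TFLite:

#include <cstdint>
#include <cstdio>

// A signed 16-bit fixed-point number with 3 integer bits has 12 fractional
// bits, so the represented real value is raw / 2^12.
float FixedQ3_12ToFloat(int16_t raw) { return raw / 4096.0f; }

int main() {
  std::printf("%f\n", FixedQ3_12ToFloat(-32768));  // -8.000000 (lower bound)
  std::printf("%f\n", FixedQ3_12ToFloat(32767));   //  7.999756 (upper bound)
  std::printf("%f\n", FixedQ3_12ToFloat(4096));    //  1.000000
  return 0;
}

This symmetric fixed-point encoding is also why the int16 GEMM destination set up just below uses zero_point = 0 rather than an affine offset.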
- bool gemm_already_performed = false; -#ifdef GEMMLOWP_NEON - if (fc_batches == 1 && !(fc_output_depth % 4) && !(fc_accum_depth % 8)) { - GEMVForLstmCell(concat_temp_shape, concat_temp_data_uint8, weights_shape, - weights_data_uint8, weights_zero_point, bias_shape, - bias_data_int32, accum_multiplier, accum_shift, - activ_temp_shape, activ_temp_data_int16); - gemm_already_performed = true; - } -#endif - if (!gemm_already_performed) { - cpu_backend_gemm::MatrixParams<uint8> lhs_params; - lhs_params.rows = fc_output_depth; - lhs_params.cols = fc_accum_depth; - lhs_params.order = cpu_backend_gemm::Order::kRowMajor; - lhs_params.zero_point = weights_zero_point; - cpu_backend_gemm::MatrixParams<uint8> rhs_params; - rhs_params.rows = fc_accum_depth; - rhs_params.cols = fc_batches; - rhs_params.order = cpu_backend_gemm::Order::kColMajor; - rhs_params.zero_point = 128; - cpu_backend_gemm::MatrixParams<int16> dst_params; - dst_params.rows = fc_output_depth; - dst_params.cols = fc_batches; - dst_params.order = cpu_backend_gemm::Order::kColMajor; - dst_params.zero_point = 0; - cpu_backend_gemm::GemmParams<int32, int16> gemm_params; - gemm_params.bias = bias_data_int32; - gemm_params.multiplier_fixedpoint = accum_multiplier; - gemm_params.multiplier_exponent = accum_shift; - cpu_backend_gemm::Gemm( - lhs_params, weights_data_uint8, rhs_params, concat_temp_data_uint8, - dst_params, activ_temp_data_int16, gemm_params, cpu_backend_context); - } + cpu_backend_gemm::MatrixParams<uint8> lhs_params; + lhs_params.rows = fc_output_depth; + lhs_params.cols = fc_accum_depth; + lhs_params.order = cpu_backend_gemm::Order::kRowMajor; + lhs_params.zero_point = weights_zero_point; + cpu_backend_gemm::MatrixParams<uint8> rhs_params; + rhs_params.rows = fc_accum_depth; + rhs_params.cols = fc_batches; + rhs_params.order = cpu_backend_gemm::Order::kColMajor; + rhs_params.zero_point = 128; + cpu_backend_gemm::MatrixParams<int16> dst_params; + dst_params.rows = fc_output_depth; + dst_params.cols = fc_batches; + dst_params.order = cpu_backend_gemm::Order::kColMajor; + dst_params.zero_point = 0; + cpu_backend_gemm::GemmParams<int32, int16> gemm_params; + gemm_params.bias = bias_data_int32; + gemm_params.multiplier_fixedpoint = accum_multiplier; + gemm_params.multiplier_exponent = accum_shift; + cpu_backend_gemm::Gemm( + lhs_params, weights_data_uint8, rhs_params, concat_temp_data_uint8, + dst_params, activ_temp_data_int16, gemm_params, cpu_backend_context); // Rest of the LSTM cell: tanh and logistic math functions, and some adds // and muls, all done in 16-bit fixed-point. 
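The GemmParams<int32, int16> used above carry accum_multiplier and accum_shift as a fixed-point multiplier plus a power-of-two exponent; the same rescaling appears in the NEON code removed earlier (vqrdmulhq_n_s32 followed by a rounding shift). A scalar sketch of that rescaling, written out independently for illustration; it mirrors the usual gemmlowp-style rounding conventions but is not the library code:

#include <cstdint>
#include <limits>

// Returns the high 32 bits of 2*a*b, rounded; the only saturating case is
// a == b == INT32_MIN.
int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b) {
  if (a == std::numeric_limits<int32_t>::min() &&
      b == std::numeric_limits<int32_t>::min()) {
    return std::numeric_limits<int32_t>::max();
  }
  const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  const int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
  return static_cast<int32_t>((ab + nudge) / (1ll << 31));
}

// Rounding division by 2^exponent (arithmetic shift with round-to-nearest).
int32_t RoundingDivideByPOT(int32_t x, int exponent) {
  const int32_t mask = static_cast<int32_t>((1ll << exponent) - 1);
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

// acc * multiplier * 2^exponent: this is what multiplier_fixedpoint and
// multiplier_exponent (accum_multiplier / accum_shift above) express.
int32_t MultiplyByQuantizedMultiplier(int32_t acc, int32_t multiplier,
                                      int exponent) {
  if (exponent > 0) acc *= (1 << exponent);  // apply a left shift first
  acc = SaturatingRoundingDoublingHighMul(acc, multiplier);
  if (exponent < 0) acc = RoundingDivideByPOT(acc, -exponent);
  return acc;
}

The multiplier is a Q31 fixed-point value in [0.5, 1), so multiplier * 2^exponent can express any positive real scale; the GEMM backend applies this rescaling when narrowing the int32 accumulators down to the int16 activations.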
@@ -6354,7 +5347,8 @@ inline void TransposeConvV2( const ConvParams& params, const RuntimeShape& input_shape, const float* input_data, const RuntimeShape& hwoi_ordered_filter_shape, const float* hwoi_ordered_filter_data, const RuntimeShape& output_shape, - float* output_data, const RuntimeShape& col2im_shape, float* col2im_data) { + float* output_data, const RuntimeShape& col2im_shape, float* col2im_data, + CpuBackendContext* cpu_backend_context) { gemmlowp::ScopedProfilingLabel label("TransposeConvV2"); TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(hwoi_ordered_filter_shape.DimensionsCount(), 4); @@ -6387,21 +5381,25 @@ inline void TransposeConvV2( const int hwoi_ordered_filter_total_size = filter_height * filter_width * output_depth; - typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> - Matrix; - typedef Eigen::Map<Matrix> MatrixRef; - typedef Eigen::Map<const Matrix> ConstMatrixRef; - ConstMatrixRef hwoi_ordered_filter_matrix_map( - hwoi_ordered_filter_data, hwoi_ordered_filter_total_size, input_depth); + cpu_backend_gemm::MatrixParams<float> lhs_params; + lhs_params.order = cpu_backend_gemm::Order::kRowMajor; + lhs_params.rows = hwoi_ordered_filter_total_size; + lhs_params.cols = input_depth; float* output_data_p = output_data; tensor_utils::ZeroVector(output_data, output_offset * batch_size); for (int i = 0; i < batch_size; ++i) { - ConstMatrixRef input_matrix_map(input_data + input_offset * i, - input_image_size, input_depth); - MatrixRef output_matrix_map(col2im_data, input_image_size, - hwoi_ordered_filter_total_size); - Gemm(input_matrix_map, hwoi_ordered_filter_matrix_map.transpose(), - &output_matrix_map); + cpu_backend_gemm::MatrixParams<float> rhs_params; + rhs_params.order = cpu_backend_gemm::Order::kColMajor; + rhs_params.rows = input_depth; + rhs_params.cols = input_image_size; + cpu_backend_gemm::MatrixParams<float> dst_params; + dst_params.order = cpu_backend_gemm::Order::kColMajor; + dst_params.rows = hwoi_ordered_filter_total_size; + dst_params.cols = input_image_size; + cpu_backend_gemm::GemmParams<float, float> gemm_params; + cpu_backend_gemm::Gemm(lhs_params, hwoi_ordered_filter_data, rhs_params, + input_data + input_offset * i, dst_params, + col2im_data, gemm_params, cpu_backend_context); Col2im(col2im_data, output_depth, output_height, output_width, filter_height, filter_width, padding_top, padding_left, @@ -6411,29 +5409,6 @@ inline void TransposeConvV2( } } -// TODO(renjieliu): Investigate whether we need to keep this. -inline void TransposeConv( - const ConvParams& params, const RuntimeShape& input_shape, - const float* input_data, const RuntimeShape& filter_shape, - const float* filter_data, const RuntimeShape& output_shape, - float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) { - gemmlowp::ScopedProfilingLabel label("TransposeConv"); - // Note we could use transposed weights with forward conv for unstrided - // cases. But we are already getting good performance with this code as-is. 
- TFLITE_DCHECK(im2col_data); - TransposeIm2col(params, 0, input_shape, input_data, filter_shape, - output_shape, im2col_data); - - const auto im2col_matrix_map = - MapAsMatrixWithLastDimAsRows(im2col_data, im2col_shape); - const auto filter_matrix_map = - MapAsMatrixWithFirstDimAsCols(filter_data, filter_shape); - auto output_matrix_map = - MapAsMatrixWithLastDimAsRows(output_data, output_shape); - - Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map); -} - // Integer-only version of ResizeNearestNeighbor. Since scales are represented // in fixed-point and thus approximated, |in_x| or |in_y| may differ from the // reference version. Debug checks are in place to test if this occurs. diff --git a/tensorflow/lite/kernels/internal/reference/fully_connected.h b/tensorflow/lite/kernels/internal/reference/fully_connected.h index 1f62e3b3068..51c1deff969 100644 --- a/tensorflow/lite/kernels/internal/reference/fully_connected.h +++ b/tensorflow/lite/kernels/internal/reference/fully_connected.h @@ -15,7 +15,6 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_ -#include "fixedpoint/fixedpoint.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/round.h" diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h index a694ba2aaa9..e3138e86b1f 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h @@ -16,7 +16,8 @@ limitations under the License. #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_ #include <limits> -#include "public/gemmlowp.h" + +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/types.h" diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h index b424a3ef170..737e9d2263c 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h @@ -15,7 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ -#include "public/gemmlowp.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/internal/common.h" namespace tflite { diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h index 5e33d089945..dad17fb7f4a 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h @@ -15,7 +15,8 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_ -#include "public/gemmlowp.h" +#include "fixedpoint/fixedpoint.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/kernels/internal/common.h" namespace tflite { diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h index 081928bc88d..cc704387f38 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h @@ -16,6 +16,8 @@ limitations under the License. #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_ #include <limits> + +#include "fixedpoint/fixedpoint.h" #include "tensorflow/lite/kernels/internal/common.h" namespace tflite { diff --git a/tensorflow/lite/kernels/internal/reference/pooling.h b/tensorflow/lite/kernels/internal/reference/pooling.h index 847fac77df1..2cb23472f29 100644 --- a/tensorflow/lite/kernels/internal/reference/pooling.h +++ b/tensorflow/lite/kernels/internal/reference/pooling.h @@ -15,7 +15,6 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_ -#include "fixedpoint/fixedpoint.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/round.h" diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h index 1594a0a1199..8488f7ae266 100644 --- a/tensorflow/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h @@ -27,7 +27,7 @@ limitations under the License. #include <type_traits> #include "fixedpoint/fixedpoint.h" -#include "public/gemmlowp.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" @@ -1578,7 +1578,7 @@ inline void ConcatenationWithScaling(const ConcatenationParams& params, const float bias = -input_zeropoint[i] * scale; for (int j = 0; j < copy_size; ++j) { const int32_t value = - static_cast<int32_t>(round(input_ptr[j] * scale + bias)) + + static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint; output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0)); @@ -1689,7 +1689,7 @@ void PackWithScaling(const PackParams& params, auto input_ptr = input_data[i]; for (int j = 0; j < copy_size; ++j) { const int32_t value = - static_cast<int32_t>(round(input_ptr[j] * scale + bias)) + + static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint; output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0)); @@ -1914,23 +1914,25 @@ inline void LstmCell( // aiming for 16-bit fixed-point quantization of these internal nodes here. 
// template <int StateIntegerBits> -inline void LstmCell( - const LstmCellParams& params, const RuntimeShape& unextended_input_shape, - const uint8* input_data_uint8, - const RuntimeShape& unextended_prev_activ_shape, - const uint8* prev_activ_data_uint8, const RuntimeShape& weights_shape, - const uint8* weights_data_uint8, const RuntimeShape& unextended_bias_shape, - const int32* bias_data_int32, - const RuntimeShape& unextended_prev_state_shape, - const int16* prev_state_data_int16, - const RuntimeShape& unextended_output_state_shape, - int16* output_state_data_int16, - const RuntimeShape& unextended_output_activ_shape, - uint8* output_activ_data_uint8, - const RuntimeShape& unextended_concat_temp_shape, - uint8* concat_temp_data_uint8, - const RuntimeShape& unextended_activ_temp_shape, - int16* activ_temp_data_int16, gemmlowp::GemmContext* gemmlowp_context) { +inline void LstmCell(const LstmCellParams& params, + const RuntimeShape& unextended_input_shape, + const uint8* input_data_uint8, + const RuntimeShape& unextended_prev_activ_shape, + const uint8* prev_activ_data_uint8, + const RuntimeShape& weights_shape, + const uint8* weights_data_uint8, + const RuntimeShape& unextended_bias_shape, + const int32* bias_data_int32, + const RuntimeShape& unextended_prev_state_shape, + const int16* prev_state_data_int16, + const RuntimeShape& unextended_output_state_shape, + int16* output_state_data_int16, + const RuntimeShape& unextended_output_activ_shape, + uint8* output_activ_data_uint8, + const RuntimeShape& unextended_concat_temp_shape, + uint8* concat_temp_data_uint8, + const RuntimeShape& unextended_activ_temp_shape, + int16* activ_temp_data_int16, void* gemmlowp_context) { (void)gemmlowp_context; // only used in optimized code. int32 weights_zero_point = params.weights_zero_point; int32 accum_multiplier = params.accum_multiplier; @@ -3149,7 +3151,7 @@ inline void Exp(const T* input_data, const size_t num_elements, T* output_data) { gemmlowp::ScopedProfilingLabel label("Exp"); for (size_t idx = 0; idx < num_elements; ++idx) { - output_data[idx] = exp(input_data[idx]); + output_data[idx] = std::exp(input_data[idx]); } } @@ -3420,10 +3422,10 @@ inline void Mean(const tflite::MeanParams& op_params, temp_value = temp_value / num_elements_in_axis; if (ordinary_mean) { output_data[Offset(output_shape, out_b, 0, 0, out_d)] = - static_cast<uint8_t>(round(temp_value)); + static_cast<uint8_t>(std::round(temp_value)); } else { output_data[Offset(output_shape, out_b, 0, 0, out_d)] = - static_cast<uint8_t>(round(temp_value * scale + bias)) + + static_cast<uint8_t>(std::round(temp_value * scale + bias)) + output_zero_point; } } @@ -3490,8 +3492,9 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point, // TODO(b/116341117): Eliminate float and do this completely in 8bit. const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5; for (size_t idx = 0; idx < num_outputs; ++idx) { - const U value = static_cast<U>(round(temp_sum[idx] * scale + bias)) + - output_zero_point; + const U value = + static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + + output_zero_point; output_data[idx] = static_cast<T>(value); } } else { @@ -3501,8 +3504,9 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point, static_cast<float>(num_elements_in_axis); // Convert to float value. 
- output_data[idx] = static_cast<T>(round(float_mean * scale + bias)) + - output_zero_point; + output_data[idx] = + static_cast<T>(std::round(float_mean * scale + bias)) + + output_zero_point; } } } diff --git a/tensorflow/lite/kernels/internal/tensor_ctypes.h b/tensorflow/lite/kernels/internal/tensor_ctypes.h index f77fae251d8..8ee95d4d5b3 100644 --- a/tensorflow/lite/kernels/internal/tensor_ctypes.h +++ b/tensorflow/lite/kernels/internal/tensor_ctypes.h @@ -66,6 +66,11 @@ inline const float* GetTensorData(const TfLiteTensor* tensor) { return tensor != nullptr ? tensor->data.f : nullptr; } +template <> +inline const TfLiteFloat16* GetTensorData(const TfLiteTensor* tensor) { + return tensor != nullptr ? tensor->data.f16 : nullptr; +} + template <> inline const uint8_t* GetTensorData(const TfLiteTensor* tensor) { return tensor != nullptr ? tensor->data.uint8 : nullptr; diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc index 2d0aee19ecb..a518daf2cfd 100644 --- a/tensorflow/lite/kernels/lstm_eval.cc +++ b/tensorflow/lite/kernels/lstm_eval.cc @@ -16,6 +16,12 @@ limitations under the License. #include <cstdint> +#ifdef GEMMLOWP_PROFILING +#include "profiling/profiler.h" +#endif + +#include "third_party/eigen3/Eigen/Core" +#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/kernels/internal/kernel_utils.h" #include "tensorflow/lite/kernels/internal/tensor_utils.h" @@ -119,6 +125,9 @@ inline void LstmStepWithAuxInput( float* output_state_ptr, float* cell_state_ptr, float* input_gate_scratch, float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch, float* output_ptr_batch) { +#ifdef GEMMLOWP_PROFILING + gemmlowp::ScopedProfilingLabel label("LstmStepWithAuxInputFloat"); +#endif // Since we have already checked that weights are all there or none, we can // check the existence of only one to the get the condition. const bool use_cifg = (input_to_input_weights_ptr == nullptr); @@ -362,6 +371,28 @@ inline void LstmStepWithAuxInput( } } +void ApplyActivationsToVector(float* input, int input_size, + TfLiteFusedActivation activation_type, + float* output) { + using VectorMap = Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, 1>>; + VectorMap input_map(input, input_size, 1); + VectorMap output_map(output, input_size, 1); + switch (activation_type) { + case kTfLiteActSigmoid: { + output_map.array() = input_map.array().logistic(); + break; + } + case kTfLiteActTanh: { + output_map.array() = input_map.array().tanh(); + break; + } + default: { + tensor_utils::ApplyActivationToVector(input, input_size, activation_type, + output); + } + } +} + // Same as above but with quantized weight matrices. In detail: // Input of size 'n_batch * n_input': // input_ptr_batch @@ -473,6 +504,9 @@ inline void LstmStepWithAuxInput( int8_t* quantized_aux_input_ptr_batch, int8_t* quantized_output_state_ptr, int8_t* quantized_cell_state_ptr, float* output_state_ptr, float* cell_state_ptr, float* output_ptr_batch) { +#ifdef GEMMLOWP_PROFILING + gemmlowp::ScopedProfilingLabel label("LstmStepWithAuxInputHybrid"); +#endif // Since we have already checked that weights are all there or none, we // can check the existence of only one to the get the condition. 
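The ApplyActivationsToVector helper added earlier in this file routes the sigmoid and tanh gates through Eigen so an entire buffer is evaluated as one vectorized array expression, and falls back to tensor_utils for any other fused activation. A standalone sketch of that Eigen::Map pattern, mirroring the include added above; illustration only, not the lstm_eval.cc code:

#include "third_party/eigen3/Eigen/Core"

// Wrap a raw float buffer as an Eigen column vector and apply the activation
// as a single coefficient-wise expression (no per-element function calls).
void SigmoidInPlace(float* data, int size) {
  using VectorMap = Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, 1>>;
  VectorMap map(data, size, 1);
  map.array() = map.array().logistic();  // 1 / (1 + exp(-x))
}

void TanhInPlace(float* data, int size) {
  using VectorMap = Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, 1>>;
  VectorMap map(data, size, 1);
  map.array() = map.array().tanh();
}

Keeping only these two activations on the Eigen path covers the common LSTM gate math while leaving the less frequent activations on the existing tensor_utils implementation.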
const bool use_cifg = (input_to_input_weights_ptr == nullptr); @@ -674,8 +708,8 @@ inline void LstmStepWithAuxInput( tensor_utils::VectorBatchVectorAdd(input_gate_bias_ptr, n_cell, n_batch, input_gate_scratch); } - tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch, - input_gate_scratch); + ApplyActivationsToVector(input_gate_scratch, n_cell * n_batch, + kTfLiteActSigmoid, input_gate_scratch); } // For each batch and cell: update forget gate. @@ -697,8 +731,8 @@ inline void LstmStepWithAuxInput( tensor_utils::VectorBatchVectorAdd(forget_gate_bias_ptr, n_cell, n_batch, forget_gate_scratch); } - tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch, - forget_gate_scratch); + ApplyActivationsToVector(forget_gate_scratch, n_cell * n_batch, + kTfLiteActSigmoid, forget_gate_scratch); // For each batch and cell: update the cell. tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr, @@ -712,8 +746,8 @@ inline void LstmStepWithAuxInput( tensor_utils::VectorBatchVectorAdd(cell_bias_ptr, n_cell, n_batch, cell_scratch); } - tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell, - params->activation, cell_scratch); + ApplyActivationsToVector(cell_scratch, n_batch * n_cell, params->activation, + cell_scratch); if (use_cifg) { tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell, forget_gate_scratch); @@ -749,10 +783,10 @@ inline void LstmStepWithAuxInput( tensor_utils::VectorBatchVectorAdd(output_gate_bias_ptr, n_cell, n_batch, output_gate_scratch); } - tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell, - output_gate_scratch); - tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell, - params->activation, cell_scratch); + ApplyActivationsToVector(output_gate_scratch, n_batch * n_cell, + kTfLiteActSigmoid, output_gate_scratch); + ApplyActivationsToVector(cell_state_ptr, n_batch * n_cell, params->activation, + cell_scratch); tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch, n_batch * n_cell, output_gate_scratch); diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc index fb9807b7fa9..d8a6d5d3051 100644 --- a/tensorflow/lite/kernels/register.cc +++ b/tensorflow/lite/kernels/register.cc @@ -369,7 +369,9 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR()); AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND()); AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT()); - AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK()); + AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK(), + /* min_version */ 1, + /* max_version */ 2); AddBuiltin(BuiltinOperator_FLOOR_DIV, Register_FLOOR_DIV()); AddBuiltin(BuiltinOperator_SQUARE, Register_SQUARE()); AddBuiltin(BuiltinOperator_ZEROS_LIKE, Register_ZEROS_LIKE()); diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h index dfc5783422a..44f8aa317e2 100644 --- a/tensorflow/lite/kernels/test_util.h +++ b/tensorflow/lite/kernels/test_util.h @@ -20,7 +20,6 @@ limitations under the License. 
#include <gmock/gmock.h> #include <gtest/gtest.h> - #include "tensorflow/core/platform/logging.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/internal/tensor_utils.h" @@ -568,6 +567,7 @@ class SingleOpTest : public ::testing::TestWithParam<string> { template <typename T> TensorType GetTensorType() { if (std::is_same<T, float>::value) return TensorType_FLOAT32; + if (std::is_same<T, TfLiteFloat16>::value) return TensorType_FLOAT16; if (std::is_same<T, int32_t>::value) return TensorType_INT32; if (std::is_same<T, uint8_t>::value) return TensorType_UINT8; if (std::is_same<T, string>::value) return TensorType_STRING; diff --git a/tensorflow/lite/kernels/transpose_conv.cc b/tensorflow/lite/kernels/transpose_conv.cc index 094b8c5a05c..8bca828a1d9 100644 --- a/tensorflow/lite/kernels/transpose_conv.cc +++ b/tensorflow/lite/kernels/transpose_conv.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/kernels/cpu_backend_support.h" #include "tensorflow/lite/kernels/eigen_support.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" @@ -85,11 +86,13 @@ struct OpData { void* Init(TfLiteContext* context, const char* buffer, size_t length) { auto* data = new OpData; eigen_support::IncrementUsageCounter(context); + cpu_backend_support::IncrementUsageCounter(context); return data; } void Free(TfLiteContext* context, void* buffer) { eigen_support::DecrementUsageCounter(context); + cpu_backend_support::DecrementUsageCounter(context); delete reinterpret_cast<OpData*>(buffer); } @@ -306,8 +309,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } template <KernelType kernel_type> -void EvalFloat(const TfLiteTransposeConvParams* params, const OpData* data, - const TfLiteTensor* input, const TfLiteTensor* weights, +void EvalFloat(TfLiteContext* context, const TfLiteTransposeConvParams* params, + const OpData* data, const TfLiteTensor* input, + const TfLiteTensor* weights, const TfLiteTensor* transposed_weights, TfLiteTensor* col2im, TfLiteTensor* output) { tflite::ConvParams op_params; @@ -333,7 +337,8 @@ void EvalFloat(const TfLiteTransposeConvParams* params, const OpData* data, GetTensorShape(transposed_weights), GetTensorData<float>(transposed_weights), GetTensorShape(output), GetTensorData<float>(output), GetTensorShape(col2im), - GetTensorData<float>(col2im)); + GetTensorData<float>(col2im), + cpu_backend_support::GetFromContext(context)); break; } } @@ -419,8 +424,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { ResizeAndTransposeWeights(context, weights, transposed_weights); } } - EvalFloat<kernel_type>(params, data, input, weights, transposed_weights, - col2im, output); + EvalFloat<kernel_type>(context, params, data, input, weights, + transposed_weights, col2im, output); break; } case kTfLiteUInt8: { diff --git a/tensorflow/lite/kernels/unpack.cc b/tensorflow/lite/kernels/unpack.cc index eed69ee7e53..3af2e969a7b 100644 --- a/tensorflow/lite/kernels/unpack.cc +++ b/tensorflow/lite/kernels/unpack.cc @@ -42,9 +42,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { axis += NumDimensions(input); } TF_LITE_ENSURE(context, 0 <= axis && axis < NumDimensions(input)); - if (input->type != kTfLiteInt32 && input->type != kTfLiteFloat32) { - context->ReportError(context, - "Currently pack only supports int32 and float32."); + if (input->type != 
kTfLiteInt32 && input->type != kTfLiteFloat32 && + input->type != kTfLiteUInt8 && input->type != kTfLiteInt8) { + context->ReportError(context, "Type '%s' is not supported by unpack.", + TfLiteTypeGetName(input->type)); return kTfLiteError; } @@ -64,6 +65,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteIntArray* copied_output_shape = TfLiteIntArrayCopy(output_shape); TfLiteTensor* output = GetOutput(context, node, i); TF_LITE_ENSURE_EQ(context, output->type, input->type); + // Guarantee input/output quantization params match as we do not support + // rescaling of unpacked quantized tensors. + TF_LITE_ENSURE_EQ(context, input->params.zero_point, + output->params.zero_point); + TF_LITE_ENSURE_EQ(context, input->params.scale, output->params.scale); TF_LITE_ENSURE_OK( context, context->ResizeTensor(context, output, copied_output_shape)); } @@ -98,9 +104,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { UnpackImpl<int32_t>(context, node, input, data->num, data->axis); break; } + case kTfLiteUInt8: { + UnpackImpl<uint8_t>(context, node, input, data->num, data->axis); + break; + } + case kTfLiteInt8: { + UnpackImpl<int8_t>(context, node, input, data->num, data->axis); + break; + } default: { - context->ReportError(context, - "Currently pack only supports int32 and float32."); + context->ReportError(context, "Type '%s' is not supported by unpack.", + TfLiteTypeGetName(input->type)); return kTfLiteError; } } diff --git a/tensorflow/lite/kernels/unpack_test.cc b/tensorflow/lite/kernels/unpack_test.cc index 365970d683e..487fc95ea88 100644 --- a/tensorflow/lite/kernels/unpack_test.cc +++ b/tensorflow/lite/kernels/unpack_test.cc @@ -159,6 +159,104 @@ TEST(UnpackOpTest, IntThreeDimensionsOutputs) { /*type=*/TensorType_INT32); } +// uint8 tests. +TEST(UnpackOpTest, Uint8ThreeOutputs) { + Check<uint8_t>(/*axis=*/0, /*input_shape=*/{3, 2}, + /*input_data=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_shape=*/{{2}, {2}, {2}}, + /*expected_output_data=*/{{1, 2}, {3, 4}, {5, 6}}, + /*type=*/TensorType_UINT8); +} + +TEST(UnpackOpTest, Uint8ThreeOutputsAxisOne) { + Check<uint8_t>(/*axis=*/1, /*input_shape=*/{3, 2}, + /*input_data=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_shape=*/{{3}, {3}}, + /*expected_output_data=*/{{1, 3, 5}, {2, 4, 6}}, + /*type=*/TensorType_UINT8); +} + +TEST(UnpackOpTest, Uint8ThreeOutputsNegativeAxisOne) { + Check<uint8_t>(/*axis=*/-1, /*input_shape=*/{3, 2}, + /*input_data=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_shape=*/{{3}, {3}}, + /*expected_output_data=*/{{1, 3, 5}, {2, 4, 6}}, + /*type=*/TensorType_UINT8); +} + +TEST(UnpackOpTest, Uint8ThreeOutputsNegativeAxisTwo) { + Check<uint8_t>(/*axis=*/-2, /*input_shape=*/{3, 2}, + /*input_data=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_shape=*/{{2}, {2}, {2}}, + /*expected_output_data=*/{{1, 2}, {3, 4}, {5, 6}}, + /*type=*/TensorType_UINT8); +} + +TEST(UnpackOpTest, Uint8OneOutput) { + Check<uint8_t>(/*axis=*/0, /*input_shape=*/{1, 6}, + /*input_data=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_shape=*/{{6}}, + /*expected_output_data=*/{{1, 2, 3, 4, 5, 6}}, + /*type=*/TensorType_UINT8); +} + +TEST(UnpackOpTest, Uint8ThreeDimensionsOutputs) { + Check<uint8_t>(/*axis=*/2, /*input_shape=*/{2, 2, 2}, + /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*expected_output_shape=*/{{2, 2}, {2, 2}}, + /*expected_output_data=*/{{1, 3, 5, 7}, {2, 4, 6, 8}}, + /*type=*/TensorType_UINT8); +} + +// int8 tests. 
+TEST(UnpackOpTest, Int8ThreeOutputs) { + Check<int8_t>(/*axis=*/0, /*input_shape=*/{3, 2}, + /*input_data=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_shape=*/{{2}, {2}, {2}}, + /*expected_output_data=*/{{1, 2}, {3, 4}, {5, 6}}, + /*type=*/TensorType_INT8); +} + +TEST(UnpackOpTest, Int8ThreeOutputsAxisOne) { + Check<int8_t>(/*axis=*/1, /*input_shape=*/{3, 2}, + /*input_data=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_shape=*/{{3}, {3}}, + /*expected_output_data=*/{{1, 3, 5}, {2, 4, 6}}, + /*type=*/TensorType_INT8); +} + +TEST(UnpackOpTest, Int8ThreeOutputsNegativeAxisOne) { + Check<int8_t>(/*axis=*/-1, /*input_shape=*/{3, 2}, + /*input_data=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_shape=*/{{3}, {3}}, + /*expected_output_data=*/{{1, 3, 5}, {2, 4, 6}}, + /*type=*/TensorType_INT8); +} + +TEST(UnpackOpTest, Int8ThreeOutputsNegativeAxisTwo) { + Check<int8_t>(/*axis=*/-2, /*input_shape=*/{3, 2}, + /*input_data=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_shape=*/{{2}, {2}, {2}}, + /*expected_output_data=*/{{1, 2}, {3, 4}, {5, 6}}, + /*type=*/TensorType_INT8); +} + +TEST(UnpackOpTest, Int8OneOutput) { + Check<int8_t>(/*axis=*/0, /*input_shape=*/{1, 6}, + /*input_data=*/{1, 2, 3, 4, 5, 6}, + /*expected_output_shape=*/{{6}}, + /*expected_output_data=*/{{1, 2, 3, 4, 5, 6}}, + /*type=*/TensorType_INT8); +} + +TEST(UnpackOpTest, Int8ThreeDimensionsOutputs) { + Check<int8_t>(/*axis=*/2, /*input_shape=*/{2, 2, 2}, + /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8}, + /*expected_output_shape=*/{{2, 2}, {2, 2}}, + /*expected_output_data=*/{{1, 3, 5, 7}, {2, 4, 6, 8}}, + /*type=*/TensorType_INT8); +} + } // namespace } // namespace tflite diff --git a/tensorflow/lite/optional_debug_tools.cc b/tensorflow/lite/optional_debug_tools.cc index 1113bf01b17..a59af3d680c 100644 --- a/tensorflow/lite/optional_debug_tools.cc +++ b/tensorflow/lite/optional_debug_tools.cc @@ -56,6 +56,8 @@ const char* TensorTypeName(TfLiteType type) { return "kTfLiteInt16"; case kTfLiteComplex64: return "kTfLiteComplex64"; + case kTfLiteFloat16: + return "kTfLiteFloat16"; } return "(invalid)"; } diff --git a/tensorflow/lite/profiling/BUILD b/tensorflow/lite/profiling/BUILD index aa5638fc92e..452c53aff94 100644 --- a/tensorflow/lite/profiling/BUILD +++ b/tensorflow/lite/profiling/BUILD @@ -10,16 +10,21 @@ common_copts = [ cc_library( name = "profiler", - hdrs = ["profiler.h"], + hdrs = [ + "buffered_profiler.h", + "noop_profiler.h", + "profiler.h", + ], copts = common_copts, - deps = [":profile_buffer"], + deps = [ + ":profile_buffer", + "//tensorflow/lite/core/api", + ], ) cc_test( name = "profiler_test", srcs = ["profiler_test.cc"], - copts = ["-DTFLITE_PROFILING_ENABLED"], - defines = ["TFLITE_PROFILING_ENABLED"], deps = [ ":profiler", "//tensorflow/lite/testing:util", @@ -31,7 +36,10 @@ cc_library( name = "profile_buffer", hdrs = ["profile_buffer.h"], copts = common_copts, - deps = [":time"], + deps = [ + ":time", + "//tensorflow/lite/core/api", + ], ) cc_library( @@ -58,7 +66,7 @@ cc_library( hdrs = ["profile_summarizer.h"], copts = common_copts, deps = [ - ":profiler", + ":profile_buffer", "//tensorflow/core:stats_calculator_portable", "//tensorflow/lite:framework", "//tensorflow/lite/schema:schema_fbs", @@ -71,6 +79,7 @@ cc_test( copts = common_copts, deps = [ ":profile_summarizer", + ":profiler", "//tensorflow/lite:framework", "//tensorflow/lite:schema_fbs_version", "//tensorflow/lite/kernels:kernel_util", @@ -83,8 +92,6 @@ cc_test( cc_test( name = "profile_buffer_test", srcs = ["profile_buffer_test.cc"], - copts = 
["-DTFLITE_PROFILING_ENABLED"], - defines = ["TFLITE_PROFILING_ENABLED"], deps = [ ":profile_buffer", "//tensorflow/lite/testing:util", diff --git a/tensorflow/lite/profiling/buffered_profiler.h b/tensorflow/lite/profiling/buffered_profiler.h new file mode 100644 index 00000000000..74acfe3b742 --- /dev/null +++ b/tensorflow/lite/profiling/buffered_profiler.h @@ -0,0 +1,108 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_PROFILING_BUFFERED_PROFILER_H_ +#define TENSORFLOW_LITE_PROFILING_BUFFERED_PROFILER_H_ + +#include <vector> + +#include "tensorflow/lite/core/api/profiler.h" +#include "tensorflow/lite/profiling/profile_buffer.h" + +namespace tflite { +namespace profiling { + +// Controls whether profiling is enabled or disabled and collects profiles. +// TFLite is used on platforms that don't have posix threads, so the profiler is +// kept as simple as possible. It is designed to be used only on a single +// thread. +// +// Profiles are collected using Scoped*Profile objects that begin and end a +// profile event. +// An example usage is shown below: +// +// Say Worker class has a DoWork method and we are interested in profiling +// the overall execution time for DoWork and time spent in Task1 and Task2 +// functions. +// +// class Worker { +// public: +// void DoWork() { +// ScopedProfile(&profiler, "DoWork"); +// Task1(); +// Task2(); +// ..... +// } +// +// void Task1() { +// ScopedProfile(&profiler, "Task1"); +// .... +// } +// +// void Task2() { +// ScopedProfile(&profiler, "Task2"); +// } +// +// BufferedProfiler profiler; +// } +// +// We instrument the functions that need to be profiled. +// +// Profiles can be collected by enabling profiling and then getting the +// profile events. +// +// void ProfileWorker() { +// Worker worker; +// worker.profiler.StartProfiling(); +// worker.DoWork(); +// worker.profiler.StopProfiling(); +// // Profiling is complete, extract profiles.
+// auto profile_events = worker.profiler.GetProfiles(); +// } +// +// +class BufferedProfiler : public tflite::Profiler { + public: + BufferedProfiler() : buffer_(1024, false) {} + + uint32_t BeginEvent(const char* tag, EventType event_type, + uint32_t event_metadata) override { + return buffer_.BeginEvent(tag, event_type, event_metadata); + } + + void EndEvent(uint32_t event_handle) override { + buffer_.EndEvent(event_handle); + } + + void StartProfiling() { buffer_.SetEnabled(true); } + void StopProfiling() { buffer_.SetEnabled(false); } + void Reset() { buffer_.Reset(); } + std::vector<const ProfileEvent*> GetProfileEvents() { + std::vector<const ProfileEvent*> profile_events; + profile_events.reserve(buffer_.Size()); + for (size_t i = 0; i < buffer_.Size(); i++) { + profile_events.push_back(buffer_.At(i)); + } + return profile_events; + } + + private: + ProfileBuffer* GetProfileBuffer() { return &buffer_; } + ProfileBuffer buffer_; +}; + +} // namespace profiling +} // namespace tflite + +#endif // TENSORFLOW_LITE_PROFILING_BUFFERED_PROFILER_H_ diff --git a/tensorflow/lite/profiling/noop_profiler.h b/tensorflow/lite/profiling/noop_profiler.h new file mode 100644 index 00000000000..18c12e14ca3 --- /dev/null +++ b/tensorflow/lite/profiling/noop_profiler.h @@ -0,0 +1,43 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_PROFILING_NOOP_PROFILER_H_ +#define TENSORFLOW_LITE_PROFILING_NOOP_PROFILER_H_ + +#include <vector> + +#include "tensorflow/lite/core/api/profiler.h" +#include "tensorflow/lite/profiling/profile_buffer.h" + +namespace tflite { +namespace profiling { + +// A noop version of profiler when profiling is disabled. +class NoopProfiler : public tflite::Profiler { + public: + NoopProfiler() {} + + uint32_t BeginEvent(const char*, EventType, uint32_t) override { return 0; } + void EndEvent(uint32_t) override {} + + void StartProfiling() {} + void StopProfiling() {} + void Reset() {} + std::vector<const ProfileEvent*> GetProfileEvents() { return {}; } +}; + +} // namespace profiling +} // namespace tflite + +#endif // TENSORFLOW_LITE_PROFILING_NOOP_PROFILER_H_ diff --git a/tensorflow/lite/profiling/profile_buffer.h b/tensorflow/lite/profiling/profile_buffer.h index 2202df2dbe3..8e4aab454c5 100644 --- a/tensorflow/lite/profiling/profile_buffer.h +++ b/tensorflow/lite/profiling/profile_buffer.h @@ -18,24 +18,22 @@ limitations under the License. #include <cstddef> #include <cstdint> #include <cstdio> +#include <vector> +#include "tensorflow/lite/core/api/profiler.h" #include "tensorflow/lite/profiling/time.h" namespace tflite { namespace profiling { +constexpr uint32_t kInvalidEventHandle = static_cast<uint32_t>(~0) - 1; + // A profiling event. struct ProfileEvent { // Describes the type of event. // The event_metadata field may contain additional data for interpreting // the event. 
- enum class EventType { - // Default event type, the metadata field has no special significance. - DEFAULT = 0, - // The event is an operator invocation and the event_metadata field is the - // index of operator node. - OPERATOR_INVOKE_EVENT = 1 - }; + using EventType = tflite::Profiler::EventType; // Label of the event. This usually describes the event. const char* tag; @@ -49,17 +47,6 @@ struct ProfileEvent { // Extra data describing the details of the event. uint32_t event_metadata; }; -} // namespace profiling -} // namespace tflite - -#ifdef TFLITE_PROFILING_ENABLED - -#include <sys/time.h> -#include <vector> - -namespace tflite { -namespace profiling { -constexpr uint32_t kInvalidEventHandle = static_cast<uint32_t>(~0) - 1; // A ring buffer of profile events. // This class is not thread safe. @@ -128,7 +115,7 @@ class ProfileBuffer { // Returns the profile event at the given index. If the index is invalid a // nullptr is returned. The return event may get overwritten if more events // are added to buffer. - const struct ProfileEvent* const At(size_t index) const { + const struct ProfileEvent* At(size_t index) const { size_t size = Size(); if (index >= size) { return nullptr; @@ -145,7 +132,8 @@ class ProfileBuffer { uint32_t current_index_; std::vector<ProfileEvent> event_buffer_; }; + } // namespace profiling } // namespace tflite -#endif // TFLITE_PROFILING_ENABLED + #endif // TENSORFLOW_LITE_PROFILING_PROFILE_BUFFER_H_ diff --git a/tensorflow/lite/profiling/profile_summarizer.h b/tensorflow/lite/profiling/profile_summarizer.h index d4f5da7be96..d75269ec0f3 100644 --- a/tensorflow/lite/profiling/profile_summarizer.h +++ b/tensorflow/lite/profiling/profile_summarizer.h @@ -18,9 +18,9 @@ limitations under the License. #include <vector> -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/profiling/profiler.h" #include "tensorflow/core/util/stats_calculator.h" +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/profiling/profile_buffer.h" namespace tflite { namespace profiling { diff --git a/tensorflow/lite/profiling/profile_summarizer_test.cc b/tensorflow/lite/profiling/profile_summarizer_test.cc index bbb64b832ae..8891ac5d7c4 100644 --- a/tensorflow/lite/profiling/profile_summarizer_test.cc +++ b/tensorflow/lite/profiling/profile_summarizer_test.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/lite/profiling/profile_summarizer.h" + #include <string> #include <vector> @@ -22,7 +24,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/test_util.h" #include "tensorflow/lite/model.h" -#include "tensorflow/lite/profiling/profile_summarizer.h" +#include "tensorflow/lite/profiling/buffered_profiler.h" #include "tensorflow/lite/testing/util.h" #include "tensorflow/lite/version.h" @@ -33,7 +35,6 @@ namespace { const char* kOpName = "SimpleOpEval"; -#ifdef TFLITE_PROFILING_ENABLED TfLiteStatus SimpleOpEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input1 = tflite::GetInput(context, node, /*index=*/0); const TfLiteTensor* input2 = tflite::GetInput(context, node, /*index=*/1); @@ -69,7 +70,6 @@ TfLiteRegistration* RegisterSimpleOpWithProfilingDetails() { 1}; return ®istration; } -#endif class SimpleOpModel : public SingleOpModel { public: @@ -101,9 +101,8 @@ TEST(ProfileSummarizerTest, Empty) { EXPECT_GT(output.size(), 0); } -#ifdef TFLITE_PROFILING_ENABLED TEST(ProfileSummarizerTest, Interpreter) { - Profiler profiler; + BufferedProfiler profiler; SimpleOpModel m; m.Init(RegisterSimpleOp); auto interpreter = m.GetInterpreter(); @@ -124,7 +123,7 @@ TEST(ProfileSummarizerTest, Interpreter) { } TEST(ProfileSummarizerTest, InterpreterPlusProfilingDetails) { - Profiler profiler; + BufferedProfiler profiler; SimpleOpModel m; m.Init(RegisterSimpleOpWithProfilingDetails); auto interpreter = m.GetInterpreter(); @@ -145,8 +144,6 @@ TEST(ProfileSummarizerTest, InterpreterPlusProfilingDetails) { << output; } -#endif - } // namespace } // namespace profiling } // namespace tflite diff --git a/tensorflow/lite/profiling/profiler.h b/tensorflow/lite/profiling/profiler.h index dd45518b5bf..e75c90bf6b6 100644 --- a/tensorflow/lite/profiling/profiler.h +++ b/tensorflow/lite/profiling/profiler.h @@ -15,168 +15,23 @@ limitations under the License. #ifndef TENSORFLOW_LITE_PROFILING_PROFILER_H_ #define TENSORFLOW_LITE_PROFILING_PROFILER_H_ -#include <vector> +#include "tensorflow/lite/profiling/buffered_profiler.h" +#include "tensorflow/lite/profiling/noop_profiler.h" -#include "tensorflow/lite/profiling/profile_buffer.h" +namespace tflite { +namespace profiling { +// TODO(b/131688504): Remove this and use runtime flags for profiler selection. #ifdef TFLITE_PROFILING_ENABLED - -namespace tflite { -namespace profiling { -class ScopedProfile; -class ScopedOperatorProfile; - -// Controls whether profiling is enabled or disabled and collects profiles. -// TFLite is used on platforms that don't have posix threads, so the profiler is -// kept as simple as possible. It is designed to be used only on a single -// thread. -// -// Profiles are collected using Scoped*Profile objects that begin and end a -// profile event. -// An example usage is shown in the example below: -// -// Say Worker class has a DoWork method and we are interested in profiling -// the overall execution time for DoWork and time spent in Task1 and Task2 -// functions. -// -// class Worker { -// public: -// void DoWork() { -// ScopedProfile(&controller, "DoWork"); -// Task1(); -// Task2(); -// ..... -// } -// -// void Task1() { -// ScopedProfile(&controller, "Task1"); -// .... -// } -// -// void Task2() { -// ScopedProfile(&controller, "Task2"); -// } -// -// Profiler profiler; -// } -// -// We instrument the functions that need to be profiled. -// -// Profile can be collected by enable profiling and then getting profile -// events. 
-// -// void ProfileWorker() { -// Worker worker; -// worker.profiler.EnableProfiling(); -// worker.DoWork(); -// worker.profiler.DisableProfiling(); -// // Profiling is complete, extract profiles. -// auto profile_events = worker.profiler.GetProfiles(); -// } -// -// -class Profiler { - public: - Profiler() : buffer_(1024, false) {} - - void StartProfiling() { buffer_.SetEnabled(true); } - void StopProfiling() { buffer_.SetEnabled(false); } - void Reset() { buffer_.Reset(); } - std::vector<const ProfileEvent*> GetProfileEvents() { - std::vector<const ProfileEvent*> profile_events; - profile_events.reserve(buffer_.Size()); - for (size_t i = 0; i < buffer_.Size(); i++) { - profile_events.push_back(buffer_.At(i)); - } - return profile_events; - } - - private: - friend class ScopedProfile; - friend class ScopedOperatorProfile; - ProfileBuffer* GetProfileBuffer() { return &buffer_; } - ProfileBuffer buffer_; -}; - -class ScopedProfile { - public: - // Adds a profile event to profile that begins with the construction - // of object and ends when the object goes out of scope. - // The lifetime of tag should be at least the lifetime of profiler. - - ScopedProfile(Profiler* profiler, const char* tag) - : buffer_(nullptr), event_handle_(0) { - if (profiler) { - buffer_ = profiler->GetProfileBuffer(); - event_handle_ = - buffer_->BeginEvent(tag, ProfileEvent::EventType::DEFAULT, 0); - } - } - ~ScopedProfile() { - if (buffer_) { - buffer_->EndEvent(event_handle_); - } - } - - private: - ProfileBuffer* buffer_; - int32_t event_handle_; -}; - -class ScopedOperatorProfile { - public: - // Adds a profile event to profile that begins with the construction - // of object and ends when the object goes out of scope. - // The lifetime of tag should be at least the lifetime of profiler. - ScopedOperatorProfile(Profiler* profiler, const char* tag, int node_index) - : buffer_(nullptr), event_handle_(0) { - if (profiler) { - buffer_ = profiler->GetProfileBuffer(); - event_handle_ = buffer_->BeginEvent( - tag, ProfileEvent::EventType::OPERATOR_INVOKE_EVENT, node_index); - } - } - - ~ScopedOperatorProfile() { - if (buffer_) { - buffer_->EndEvent(event_handle_); - } - } - - private: - ProfileBuffer* buffer_; - int32_t event_handle_; -}; - -} // namespace profiling -} // namespace tflite - -#define VARNAME_UNIQ(name, ctr) name##ctr - -#define SCOPED_TAGGED_OPERATOR_PROFILE(profiler, tag, node_index) \ - tflite::profiling::ScopedOperatorProfile VARNAME_UNIQ( \ - _profile_, __COUNTER__)((profiler), (tag), (node_index)) -#define SCOPED_OPERATOR_PROFILE(profiler, node_index) \ - SCOPED_TAGGED_OPERATOR_PROFILE((profiler), "OpInvoke", (node_index)) +using Profiler = BufferedProfiler; #else - -namespace tflite { -namespace profiling { -// A noop version of profiler when profiling is disabled. 
-class Profiler { - public: - Profiler() {} - void StartProfiling() {} - void StopProfiling() {} - void Reset() {} - std::vector<const ProfileEvent*> GetProfileEvents() { return {}; } -}; -} // namespace profiling -} // namespace tflite - -#define SCOPED_TAGGED_OPERATOR_PROFILE(profiler, tag, node_index) -#define SCOPED_OPERATOR_PROFILE(profiler, node_index) - +using Profiler = NoopProfiler; #endif // TFLITE_PROFILING_ENABLED +} // namespace profiling +} // namespace tflite + +#define SCOPED_TAGGED_OPERATOR_PROFILE TFLITE_SCOPED_TAGGED_OPERATOR_PROFILE +#define SCOPED_OPERATOR_PROFILE TFLITE_SCOPED_OPERATOR_PROFILE + #endif // TENSORFLOW_LITE_PROFILING_PROFILER_H_ diff --git a/tensorflow/lite/profiling/profiler_test.cc b/tensorflow/lite/profiling/profiler_test.cc index addebabe1b1..44dc3a9cd1b 100644 --- a/tensorflow/lite/profiling/profiler_test.cc +++ b/tensorflow/lite/profiling/profiler_test.cc @@ -31,17 +31,17 @@ double GetDurationOfEventMs(const ProfileEvent* event) { return (event->end_timestamp_us - event->begin_timestamp_us) / 1e3; } -void SleepForQuarterSecond(Profiler* profiler) { +void SleepForQuarterSecond(tflite::Profiler* profiler) { ScopedProfile profile(profiler, "SleepForQuarter"); std::this_thread::sleep_for(std::chrono::milliseconds(250)); } -void ChildFunction(Profiler* profiler) { +void ChildFunction(tflite::Profiler* profiler) { ScopedProfile profile(profiler, "Child"); SleepForQuarterSecond(profiler); } -void ParentFunction(Profiler* profiler) { +void ParentFunction(tflite::Profiler* profiler) { ScopedProfile profile(profiler, "Parent"); for (int i = 0; i < 2; i++) { ChildFunction(profiler); @@ -49,14 +49,14 @@ void ParentFunction(Profiler* profiler) { } TEST(ProfilerTest, NoProfilesAreCollectedWhenDisabled) { - Profiler profiler; + BufferedProfiler profiler; ParentFunction(&profiler); auto profile_events = profiler.GetProfileEvents(); EXPECT_EQ(0, profile_events.size()); } TEST(ProfilingTest, ProfilesAreCollected) { - Profiler profiler; + BufferedProfiler profiler; profiler.StartProfiling(); ParentFunction(&profiler); profiler.StopProfiling(); @@ -101,7 +101,7 @@ TEST(ProfilingTest, NullProfiler) { } TEST(ProfilingTest, ScopedProfile) { - Profiler profiler; + BufferedProfiler profiler; profiler.StartProfiling(); { SCOPED_OPERATOR_PROFILE(&profiler, 1); } profiler.StopProfiling(); @@ -109,6 +109,15 @@ TEST(ProfilingTest, ScopedProfile) { EXPECT_EQ(1, profile_events.size()); } +TEST(ProfilingTest, NoopProfiler) { + NoopProfiler profiler; + profiler.StartProfiling(); + { SCOPED_OPERATOR_PROFILE(&profiler, 1); } + profiler.StopProfiling(); + auto profile_events = profiler.GetProfileEvents(); + EXPECT_EQ(0, profile_events.size()); +} + } // namespace } // namespace profiling } // namespace tflite diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index 72b85d36a4f..17953e21d03 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -21,6 +21,7 @@ py_test( name = "interpreter_test", srcs = ["interpreter_test.py"], data = ["//tensorflow/lite/python/testdata:interpreter_test_data"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_windows", @@ -37,6 +38,7 @@ py_test( py_binary( name = "tflite_convert", srcs = ["tflite_convert.py"], + python_version = "PY2", srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [":tflite_convert_main_lib"], @@ -86,6 +88,7 @@ py_test( name = "lite_test", srcs = ["lite_test.py"], data = ["@tflite_mobilenet_ssd_quant_protobuf//:tflite_graph.pb"], + python_version = 
"PY2", shard_count = 4, srcs_version = "PY2AND3", tags = [ @@ -101,6 +104,7 @@ py_test( py_test( name = "lite_v2_test", srcs = ["lite_v2_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_windows", @@ -115,6 +119,7 @@ py_test( py_test( name = "lite_flex_test", srcs = ["lite_flex_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ # TODO(b/111881877): Enable in oss after resolving op registry issues. @@ -145,6 +150,7 @@ py_library( py_test( name = "util_test", srcs = ["util_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_windows", @@ -210,6 +216,7 @@ py_library( py_test( name = "convert_test", srcs = ["convert_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":convert", @@ -241,6 +248,7 @@ py_library( py_test( name = "convert_saved_model_test", srcs = ["convert_saved_model_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_windows", @@ -262,6 +270,7 @@ py_test( py_binary( name = "create_custom_op", srcs = ["create_custom_op.py"], + python_version = "PY2", srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ diff --git a/tensorflow/lite/python/interpreter_wrapper/python_utils.cc b/tensorflow/lite/python/interpreter_wrapper/python_utils.cc index 22ec88bafd5..110c3ac4e04 100644 --- a/tensorflow/lite/python/interpreter_wrapper/python_utils.cc +++ b/tensorflow/lite/python/interpreter_wrapper/python_utils.cc @@ -32,6 +32,8 @@ int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) { switch (tf_lite_type) { case kTfLiteFloat32: return NPY_FLOAT32; + case kTfLiteFloat16: + return NPY_FLOAT16; case kTfLiteInt32: return NPY_INT32; case kTfLiteInt16: diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index fefa1c09496..ec8aa1dccf3 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -295,7 +295,10 @@ class TFLiteConverterV2(TFLiteConverterBase): Raises: Invalid signature keys. """ - saved_model = _load(saved_model_dir, tags) + # Ensures any graphs created in Eager mode are able to run. This is required + # in order to create a tf.estimator.Exporter that exports a TFLite model. 
+ with context.eager_mode(): + saved_model = _load(saved_model_dir, tags) if not signature_keys: signature_keys = saved_model.signatures diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index d082edb88ad..d2a82bb438a 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -20,6 +20,7 @@ from __future__ import print_function import os import tempfile +from absl.testing import parameterized import numpy as np from tensorflow.lite.python import lite @@ -27,6 +28,7 @@ from tensorflow.lite.python import lite_constants from tensorflow.lite.python.interpreter import Interpreter from tensorflow.python import keras from tensorflow.python.client import session +from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -736,6 +738,84 @@ class FromSessionTest(test_util.TensorFlowTestCase): self.assertTrue(([1] == output_details[0]['shape']).all()) self.assertEqual((0., 0.), output_details[0]['quantization']) + def testInferenceInputOutputTypeFloatDefault(self): + in_tensor = array_ops.placeholder( + shape=[1, 16, 16, 3], dtype=dtypes.float32) + out_tensor = in_tensor + in_tensor + sess = session.Session() + + # Convert model and ensure model is not None. + converter = lite.TFLiteConverter.from_session(sess, [in_tensor], + [out_tensor]) + tflite_model = converter.convert() + self.assertTrue(tflite_model) + + # Check values from converted model. + interpreter = Interpreter(model_content=tflite_model) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + self.assertEqual(1, len(input_details)) + self.assertEqual('Placeholder', input_details[0]['name']) + self.assertEqual(np.float32, input_details[0]['dtype']) + self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all()) + + output_details = interpreter.get_output_details() + self.assertEqual(1, len(output_details)) + self.assertEqual('add', output_details[0]['name']) + self.assertEqual(np.float32, output_details[0]['dtype']) + self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all()) + + def testInferenceInputOutputTypeQuantizedUint8Default(self): + in_tensor = array_ops.placeholder( + shape=[1, 16, 16, 3], dtype=dtypes.float32) + out_tensor = array_ops.fake_quant_with_min_max_args( + in_tensor + in_tensor, min=0., max=1., name='output') + sess = session.Session() + + # Convert model and ensure model is not None. + converter = lite.TFLiteConverter.from_session(sess, [in_tensor], + [out_tensor]) + converter.inference_type = lite_constants.QUANTIZED_UINT8 + converter.quantized_input_stats = {'Placeholder': (0., 1.)} # mean, std_dev + tflite_model = converter.convert() + self.assertTrue(tflite_model) + + # Check values from converted model. 
+ interpreter = Interpreter(model_content=tflite_model) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + self.assertEqual(1, len(input_details)) + self.assertEqual('Placeholder', input_details[0]['name']) + self.assertEqual(np.uint8, input_details[0]['dtype']) + self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all()) + + output_details = interpreter.get_output_details() + self.assertEqual(1, len(output_details)) + self.assertEqual('output', output_details[0]['name']) + self.assertEqual(np.uint8, output_details[0]['dtype']) + self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all()) + + def testReusingConverterWithDifferentPostTrainingQuantization(self): + in_tensor = array_ops.placeholder( + shape=[1, 16, 16, 3], dtype=dtypes.float32) + out_tensor = array_ops.fake_quant_with_min_max_args( + in_tensor + in_tensor, min=0., max=1., name='output') + sess = session.Session() + + # Convert model and ensure model is not None. + converter = lite.TFLiteConverter.from_session(sess, [in_tensor], + [out_tensor]) + + converter.post_training_quantize = True + tflite_model = converter.convert() + self.assertTrue(tflite_model) + + converter.post_training_quantize = False + tflite_model = converter.convert() + self.assertTrue(tflite_model) + @test_util.run_v1_only('Incompatible with 2.0.') class FromFrozenGraphFile(test_util.TensorFlowTestCase): @@ -1148,62 +1228,70 @@ class MyAddLayer(keras.layers.Layer): @test_util.run_v1_only('Incompatible with 2.0.') -class FromKerasFile(test_util.TensorFlowTestCase): +class FromKerasFile(test_util.TensorFlowTestCase, parameterized.TestCase): def setUp(self): - keras.backend.clear_session() + super(FromKerasFile, self).setUp() + self._keras_file = None + self._custom_objects = None + if not context.executing_eagerly(): + keras.backend.clear_session() + + def tearDown(self): + if self._keras_file: + os.remove(self._keras_file) + super(FromKerasFile, self).tearDown() def _getSequentialModel(self, include_custom_layer=False): - with session.Session().as_default(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - if include_custom_layer: - model.add(MyAddLayer(1.0)) - model.add(keras.layers.RepeatVector(3)) - model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) - model.compile( - loss=keras.losses.MSE, - optimizer=keras.optimizers.RMSprop(), - metrics=[keras.metrics.categorical_accuracy], - sample_weight_mode='temporal') - x = np.random.random((1, 3)) - y = np.random.random((1, 3, 3)) - model.train_on_batch(x, y) - model.predict(x) + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_shape=(3,))) + if include_custom_layer: + model.add(MyAddLayer(1.0)) + model.add(keras.layers.RepeatVector(3)) + model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) + model.compile( + loss=keras.losses.MSE, + optimizer='sgd', + metrics=[keras.metrics.categorical_accuracy], + sample_weight_mode='temporal') + x = np.random.random((1, 3)) + y = np.random.random((1, 3, 3)) + model.train_on_batch(x, y) + model.predict(x) - try: - fd, keras_file = tempfile.mkstemp('.h5') - keras.models.save_model(model, keras_file) - finally: - os.close(fd) + try: + fd, self._keras_file = tempfile.mkstemp('.h5') + keras.models.save_model(model, self._keras_file) + finally: + os.close(fd) - if include_custom_layer: - custom_objects = {'MyAddLayer': MyAddLayer} - return keras_file, custom_objects - return keras_file + if include_custom_layer: + self._custom_objects = 
{'MyAddLayer': MyAddLayer} - def testSequentialModel(self): + @parameterized.named_parameters(('_graph', context.graph_mode), + ('_eager', context.eager_mode)) + def testSequentialModel(self, test_context): """Test a Sequential tf.keras model with default inputs.""" - keras_file = self._getSequentialModel() + with test_context(): + self._getSequentialModel() - converter = lite.TFLiteConverter.from_keras_model_file(keras_file) - tflite_model = converter.convert() - self.assertTrue(tflite_model) + converter = lite.TFLiteConverter.from_keras_model_file(self._keras_file) + tflite_model = converter.convert() + self.assertTrue(tflite_model) # Check tensor details of converted model. interpreter = Interpreter(model_content=tflite_model) interpreter.allocate_tensors() input_details = interpreter.get_input_details() - self.assertEqual(1, len(input_details)) + self.assertLen(input_details, 1) self.assertEqual('dense_input', input_details[0]['name']) self.assertEqual(np.float32, input_details[0]['dtype']) self.assertTrue(([1, 3] == input_details[0]['shape']).all()) self.assertEqual((0., 0.), input_details[0]['quantization']) output_details = interpreter.get_output_details() - self.assertEqual(1, len(output_details)) - self.assertEqual('time_distributed/Reshape_1', output_details[0]['name']) + self.assertLen(output_details, 1) self.assertEqual(np.float32, output_details[0]['dtype']) self.assertTrue(([1, 3, 3] == output_details[0]['shape']).all()) self.assertEqual((0., 0.), output_details[0]['quantization']) @@ -1214,22 +1302,22 @@ class FromKerasFile(test_util.TensorFlowTestCase): interpreter.invoke() tflite_result = interpreter.get_tensor(output_details[0]['index']) - keras_model = keras.models.load_model(keras_file) + keras_model = keras.models.load_model(self._keras_file) keras_result = keras_model.predict(input_data) np.testing.assert_almost_equal(tflite_result, keras_result, 5) - os.remove(keras_file) - def testCustomLayer(self): + @parameterized.named_parameters(('_graph', context.graph_mode), + ('_eager', context.eager_mode)) + def testCustomLayer(self, test_context): """Test a Sequential tf.keras model with default inputs.""" - keras_file, custom_objects = self._getSequentialModel( - include_custom_layer=True) + with test_context(): + self._getSequentialModel(include_custom_layer=True) - converter = lite.TFLiteConverter.from_keras_model_file( - keras_file, custom_objects=custom_objects) - - tflite_model = converter.convert() - self.assertTrue(tflite_model) + converter = lite.TFLiteConverter.from_keras_model_file( + self._keras_file, custom_objects=self._custom_objects) + tflite_model = converter.convert() + self.assertTrue(tflite_model) # Check tensor details of converted model. interpreter = Interpreter(model_content=tflite_model) @@ -1245,47 +1333,44 @@ class FromKerasFile(test_util.TensorFlowTestCase): tflite_result = interpreter.get_tensor(output_details[0]['index']) keras_model = keras.models.load_model( - keras_file, custom_objects=custom_objects) + self._keras_file, custom_objects=self._custom_objects) keras_result = keras_model.predict(input_data) np.testing.assert_almost_equal(tflite_result, keras_result, 5) - os.remove(keras_file) def testSequentialModelInputArray(self): """Test a Sequential tf.keras model testing input arrays argument.""" - keras_file = self._getSequentialModel() + self._getSequentialModel() # Invalid input array raises error. 
with self.assertRaises(ValueError) as error: lite.TFLiteConverter.from_keras_model_file( - keras_file, input_arrays=['invalid-input']) + self._keras_file, input_arrays=['invalid-input']) self.assertEqual("Invalid tensors 'invalid-input' were found.", str(error.exception)) # Valid input array. converter = lite.TFLiteConverter.from_keras_model_file( - keras_file, input_arrays=['dense_input']) + self._keras_file, input_arrays=['dense_input']) tflite_model = converter.convert() - os.remove(keras_file) self.assertTrue(tflite_model) def testSequentialModelInputShape(self): """Test a Sequential tf.keras model testing input shapes argument.""" - keras_file = self._getSequentialModel() + self._getSequentialModel() # Passing in shape of invalid input array raises error. with self.assertRaises(ValueError) as error: converter = lite.TFLiteConverter.from_keras_model_file( - keras_file, input_shapes={'invalid-input': [2, 3]}) + self._keras_file, input_shapes={'invalid-input': [2, 3]}) self.assertEqual( "Invalid tensor 'invalid-input' found in tensor shapes map.", str(error.exception)) # Passing in shape of valid input array. converter = lite.TFLiteConverter.from_keras_model_file( - keras_file, input_shapes={'dense_input': [2, 3]}) + self._keras_file, input_shapes={'dense_input': [2, 3]}) tflite_model = converter.convert() - os.remove(keras_file) self.assertTrue(tflite_model) # Check input shape from converted model. @@ -1293,31 +1378,32 @@ class FromKerasFile(test_util.TensorFlowTestCase): interpreter.allocate_tensors() input_details = interpreter.get_input_details() - self.assertEqual(1, len(input_details)) + self.assertLen(input_details, 1) self.assertEqual('dense_input', input_details[0]['name']) self.assertTrue(([2, 3] == input_details[0]['shape']).all()) def testSequentialModelOutputArray(self): """Test a Sequential tf.keras model testing output arrays argument.""" - keras_file = self._getSequentialModel() + self._getSequentialModel() # Invalid output array raises error. with self.assertRaises(ValueError) as error: lite.TFLiteConverter.from_keras_model_file( - keras_file, output_arrays=['invalid-output']) + self._keras_file, output_arrays=['invalid-output']) self.assertEqual("Invalid tensors 'invalid-output' were found.", str(error.exception)) # Valid output array. 
converter = lite.TFLiteConverter.from_keras_model_file( - keras_file, output_arrays=['time_distributed/Reshape_1']) + self._keras_file, output_arrays=['time_distributed/Reshape_1']) tflite_model = converter.convert() - os.remove(keras_file) self.assertTrue(tflite_model) - def testFunctionalModel(self): + @parameterized.named_parameters(('_graph', context.graph_mode), + ('_eager', context.eager_mode)) + def testFunctionalModel(self, test_context): """Test a Functional tf.keras model with default inputs.""" - with session.Session().as_default(): + with test_context(): inputs = keras.layers.Input(shape=(3,), name='input') x = keras.layers.Dense(2)(inputs) output = keras.layers.Dense(3)(x) @@ -1325,38 +1411,37 @@ class FromKerasFile(test_util.TensorFlowTestCase): model = keras.models.Model(inputs, output) model.compile( loss=keras.losses.MSE, - optimizer=keras.optimizers.RMSprop(), + optimizer='sgd', metrics=[keras.metrics.categorical_accuracy]) x = np.random.random((1, 3)) y = np.random.random((1, 3)) model.train_on_batch(x, y) model.predict(x) - fd, keras_file = tempfile.mkstemp('.h5') + fd, self._keras_file = tempfile.mkstemp('.h5') try: - keras.models.save_model(model, keras_file) + keras.models.save_model(model, self._keras_file) finally: os.close(fd) - # Convert to TFLite model. - converter = lite.TFLiteConverter.from_keras_model_file(keras_file) - tflite_model = converter.convert() - self.assertTrue(tflite_model) + # Convert to TFLite model. + converter = lite.TFLiteConverter.from_keras_model_file(self._keras_file) + tflite_model = converter.convert() + self.assertTrue(tflite_model) # Check tensor details of converted model. interpreter = Interpreter(model_content=tflite_model) interpreter.allocate_tensors() input_details = interpreter.get_input_details() - self.assertEqual(1, len(input_details)) + self.assertLen(input_details, 1) self.assertEqual('input', input_details[0]['name']) self.assertEqual(np.float32, input_details[0]['dtype']) self.assertTrue(([1, 3] == input_details[0]['shape']).all()) self.assertEqual((0., 0.), input_details[0]['quantization']) output_details = interpreter.get_output_details() - self.assertEqual(1, len(output_details)) - self.assertEqual('dense_1/BiasAdd', output_details[0]['name']) + self.assertLen(output_details, 1) self.assertEqual(np.float32, output_details[0]['dtype']) self.assertTrue(([1, 3] == output_details[0]['shape']).all()) self.assertEqual((0., 0.), output_details[0]['quantization']) @@ -1367,55 +1452,51 @@ class FromKerasFile(test_util.TensorFlowTestCase): interpreter.invoke() tflite_result = interpreter.get_tensor(output_details[0]['index']) - keras_model = keras.models.load_model(keras_file) + keras_model = keras.models.load_model(self._keras_file) keras_result = keras_model.predict(input_data) np.testing.assert_almost_equal(tflite_result, keras_result, 5) - os.remove(keras_file) def testFunctionalModelMultipleInputs(self): """Test a Functional tf.keras model with multiple inputs and outputs.""" - with session.Session().as_default(): - a = keras.layers.Input(shape=(3,), name='input_a') - b = keras.layers.Input(shape=(3,), name='input_b') - dense = keras.layers.Dense(4, name='dense') - c = dense(a) - d = dense(b) - e = keras.layers.Dropout(0.5, name='dropout')(c) + a = keras.layers.Input(shape=(3,), name='input_a') + b = keras.layers.Input(shape=(3,), name='input_b') + dense = keras.layers.Dense(4, name='dense') + c = dense(a) + d = dense(b) + e = keras.layers.Dropout(0.5, name='dropout')(c) - model = keras.models.Model([a, b], [d, e]) - 
model.compile( - loss=keras.losses.MSE, - optimizer=keras.optimizers.RMSprop(), - metrics=[keras.metrics.mae], - loss_weights=[1., 0.5]) + model = keras.models.Model([a, b], [d, e]) + model.compile( + loss=keras.losses.MSE, + optimizer='sgd', + metrics=[keras.metrics.mae], + loss_weights=[1., 0.5]) - input_a_np = np.random.random((10, 3)) - input_b_np = np.random.random((10, 3)) - output_d_np = np.random.random((10, 4)) - output_e_np = np.random.random((10, 4)) - model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np]) + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + output_d_np = np.random.random((10, 4)) + output_e_np = np.random.random((10, 4)) + model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np]) - model.predict([input_a_np, input_b_np], batch_size=5) - fd, keras_file = tempfile.mkstemp('.h5') - try: - keras.models.save_model(model, keras_file) - finally: - os.close(fd) + model.predict([input_a_np, input_b_np], batch_size=5) + fd, self._keras_file = tempfile.mkstemp('.h5') + try: + keras.models.save_model(model, self._keras_file) + finally: + os.close(fd) # Convert to TFLite model. - converter = lite.TFLiteConverter.from_keras_model_file(keras_file) + converter = lite.TFLiteConverter.from_keras_model_file(self._keras_file) tflite_model = converter.convert() self.assertTrue(tflite_model) - os.remove(keras_file) - # Check values from converted model. interpreter = Interpreter(model_content=tflite_model) interpreter.allocate_tensors() input_details = interpreter.get_input_details() - self.assertEqual(2, len(input_details)) + self.assertLen(input_details, 2) self.assertEqual('input_a', input_details[0]['name']) self.assertEqual(np.float32, input_details[0]['dtype']) self.assertTrue(([1, 3] == input_details[0]['shape']).all()) @@ -1427,7 +1508,7 @@ class FromKerasFile(test_util.TensorFlowTestCase): self.assertEqual((0., 0.), input_details[1]['quantization']) output_details = interpreter.get_output_details() - self.assertEqual(2, len(output_details)) + self.assertLen(output_details, 2) self.assertEqual('dense_1/BiasAdd', output_details[0]['name']) self.assertEqual(np.float32, output_details[0]['dtype']) self.assertTrue(([1, 4] == output_details[0]['shape']).all()) @@ -1440,32 +1521,31 @@ class FromKerasFile(test_util.TensorFlowTestCase): def testFunctionalSequentialModel(self): """Test a Functional tf.keras model containing a Sequential model.""" - with session.Session().as_default(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(2, input_shape=(3,))) - model.add(keras.layers.RepeatVector(3)) - model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) - model = keras.models.Model(model.input, model.output) + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_shape=(3,))) + model.add(keras.layers.RepeatVector(3)) + model.add(keras.layers.TimeDistributed(keras.layers.Dense(3))) + model = keras.models.Model(model.input, model.output) - model.compile( - loss=keras.losses.MSE, - optimizer=keras.optimizers.RMSprop(), - metrics=[keras.metrics.categorical_accuracy], - sample_weight_mode='temporal') - x = np.random.random((1, 3)) - y = np.random.random((1, 3, 3)) - model.train_on_batch(x, y) - model.predict(x) + model.compile( + loss=keras.losses.MSE, + optimizer='sgd', + metrics=[keras.metrics.categorical_accuracy], + sample_weight_mode='temporal') + x = np.random.random((1, 3)) + y = np.random.random((1, 3, 3)) + model.train_on_batch(x, y) + model.predict(x) - 
model.predict(x) - fd, keras_file = tempfile.mkstemp('.h5') - try: - keras.models.save_model(model, keras_file) - finally: - os.close(fd) + model.predict(x) + fd, self._keras_file = tempfile.mkstemp('.h5') + try: + keras.models.save_model(model, self._keras_file) + finally: + os.close(fd) # Convert to TFLite model. - converter = lite.TFLiteConverter.from_keras_model_file(keras_file) + converter = lite.TFLiteConverter.from_keras_model_file(self._keras_file) tflite_model = converter.convert() self.assertTrue(tflite_model) @@ -1474,14 +1554,14 @@ class FromKerasFile(test_util.TensorFlowTestCase): interpreter.allocate_tensors() input_details = interpreter.get_input_details() - self.assertEqual(1, len(input_details)) + self.assertLen(input_details, 1) self.assertEqual('dense_input', input_details[0]['name']) self.assertEqual(np.float32, input_details[0]['dtype']) self.assertTrue(([1, 3] == input_details[0]['shape']).all()) self.assertEqual((0., 0.), input_details[0]['quantization']) output_details = interpreter.get_output_details() - self.assertEqual(1, len(output_details)) + self.assertLen(output_details, 1) self.assertEqual('time_distributed/Reshape_1', output_details[0]['name']) self.assertEqual(np.float32, output_details[0]['dtype']) self.assertTrue(([1, 3, 3] == output_details[0]['shape']).all()) @@ -1493,17 +1573,16 @@ class FromKerasFile(test_util.TensorFlowTestCase): interpreter.invoke() tflite_result = interpreter.get_tensor(output_details[0]['index']) - keras_model = keras.models.load_model(keras_file) + keras_model = keras.models.load_model(self._keras_file) keras_result = keras_model.predict(input_data) np.testing.assert_almost_equal(tflite_result, keras_result, 5) - os.remove(keras_file) def testSequentialModelTocoConverter(self): """Test a Sequential tf.keras model with deprecated TocoConverter.""" - keras_file = self._getSequentialModel() + self._getSequentialModel() - converter = lite.TocoConverter.from_keras_model_file(keras_file) + converter = lite.TocoConverter.from_keras_model_file(self._keras_file) tflite_model = converter.convert() self.assertTrue(tflite_model) @@ -1511,84 +1590,6 @@ class FromKerasFile(test_util.TensorFlowTestCase): interpreter = Interpreter(model_content=tflite_model) interpreter.allocate_tensors() - def testInferenceInputOutputTypeFloatDefault(self): - in_tensor = array_ops.placeholder( - shape=[1, 16, 16, 3], dtype=dtypes.float32) - out_tensor = in_tensor + in_tensor - sess = session.Session() - - # Convert model and ensure model is not None. - converter = lite.TFLiteConverter.from_session(sess, [in_tensor], - [out_tensor]) - tflite_model = converter.convert() - self.assertTrue(tflite_model) - - # Check values from converted model. 
- interpreter = Interpreter(model_content=tflite_model) - interpreter.allocate_tensors() - - input_details = interpreter.get_input_details() - self.assertEqual(1, len(input_details)) - self.assertEqual('Placeholder', input_details[0]['name']) - self.assertEqual(np.float32, input_details[0]['dtype']) - self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all()) - - output_details = interpreter.get_output_details() - self.assertEqual(1, len(output_details)) - self.assertEqual('add', output_details[0]['name']) - self.assertEqual(np.float32, output_details[0]['dtype']) - self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all()) - - def testInferenceInputOutputTypeQuantizedUint8Default(self): - in_tensor = array_ops.placeholder( - shape=[1, 16, 16, 3], dtype=dtypes.float32) - out_tensor = array_ops.fake_quant_with_min_max_args( - in_tensor + in_tensor, min=0., max=1., name='output') - sess = session.Session() - - # Convert model and ensure model is not None. - converter = lite.TFLiteConverter.from_session(sess, [in_tensor], - [out_tensor]) - converter.inference_type = lite_constants.QUANTIZED_UINT8 - converter.quantized_input_stats = {'Placeholder': (0., 1.)} # mean, std_dev - tflite_model = converter.convert() - self.assertTrue(tflite_model) - - # Check values from converted model. - interpreter = Interpreter(model_content=tflite_model) - interpreter.allocate_tensors() - - input_details = interpreter.get_input_details() - self.assertEqual(1, len(input_details)) - self.assertEqual('Placeholder', input_details[0]['name']) - self.assertEqual(np.uint8, input_details[0]['dtype']) - self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all()) - - output_details = interpreter.get_output_details() - self.assertEqual(1, len(output_details)) - self.assertEqual('output', output_details[0]['name']) - self.assertEqual(np.uint8, output_details[0]['dtype']) - self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all()) - - def testReusingConverterWithDifferentPostTrainingQuantization(self): - in_tensor = array_ops.placeholder( - shape=[1, 16, 16, 3], dtype=dtypes.float32) - out_tensor = array_ops.fake_quant_with_min_max_args( - in_tensor + in_tensor, min=0., max=1., name='output') - sess = session.Session() - - # Convert model and ensure model is not None. - converter = lite.TFLiteConverter.from_session(sess, [in_tensor], - [out_tensor]) - - converter.post_training_quantize = True - tflite_model = converter.convert() - self.assertTrue(tflite_model) - - converter.post_training_quantize = False - tflite_model = converter.convert() - self.assertTrue(tflite_model) - @test_util.run_v1_only('Incompatible with 2.0.') class GrapplerTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/lite/python/optimize/BUILD b/tensorflow/lite/python/optimize/BUILD index bac99cc269a..61968fb6d8b 100644 --- a/tensorflow/lite/python/optimize/BUILD +++ b/tensorflow/lite/python/optimize/BUILD @@ -58,6 +58,7 @@ py_test( ":test_data", "//tensorflow/lite:testdata/multi_add.bin", ], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_oss"], deps = [ diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.cc b/tensorflow/lite/python/optimize/calibration_wrapper.cc index 285935dc9df..8ea376c835a 100644 --- a/tensorflow/lite/python/optimize/calibration_wrapper.cc +++ b/tensorflow/lite/python/optimize/calibration_wrapper.cc @@ -61,6 +61,8 @@ inline TensorType TfLiteTypeToSchemaType(TfLiteType type) { return TensorType_FLOAT32; // TODO(b/129336260): No schema type for none. 
case kTfLiteFloat32: return TensorType_FLOAT32; + case kTfLiteFloat16: + return TensorType_FLOAT16; case kTfLiteInt32: return TensorType_INT32; case kTfLiteUInt8: diff --git a/tensorflow/lite/python/util.py b/tensorflow/lite/python/util.py index 0331aa70208..3a0352f331c 100644 --- a/tensorflow/lite/python/util.py +++ b/tensorflow/lite/python/util.py @@ -31,6 +31,7 @@ from tensorflow.python.training.saver import export_meta_graph as _export_meta_g # Map of tf.dtypes to TFLite types_flag_pb2. _MAP_TF_TO_TFLITE_TYPES = { dtypes.float32: _types_pb2.FLOAT, + dtypes.float16: _types_pb2.FLOAT16, dtypes.int32: _types_pb2.INT32, dtypes.int64: _types_pb2.INT64, dtypes.string: _types_pb2.STRING, diff --git a/tensorflow/lite/python/util_test.py b/tensorflow/lite/python/util_test.py index cfb5ed365f6..65b53bc8afe 100644 --- a/tensorflow/lite/python/util_test.py +++ b/tensorflow/lite/python/util_test.py @@ -50,6 +50,8 @@ class UtilTest(test_util.TensorFlowTestCase): self.assertEqual( util.convert_dtype_to_tflite_type(dtypes.complex64), _types_pb2.COMPLEX64) + self.assertEqual( + util.convert_dtype_to_tflite_type(dtypes.half), _types_pb2.FLOAT16) with self.assertRaises(ValueError): util.convert_dtype_to_tflite_type(dtypes.bool) diff --git a/tensorflow/lite/toco/graph_transformations/quantize.cc b/tensorflow/lite/toco/graph_transformations/quantize.cc index 03cb6597738..62eaba0d756 100644 --- a/tensorflow/lite/toco/graph_transformations/quantize.cc +++ b/tensorflow/lite/toco/graph_transformations/quantize.cc @@ -64,7 +64,7 @@ bool SupportsQuantization(const Operator& op) { type == OperatorType::kRelu1 || type == OperatorType::kRelu6 || type == OperatorType::kLeakyRelu || type == OperatorType::kShape || type == OperatorType::kExpandDims || type == OperatorType::kPack || - type == OperatorType::kTopK_V2 || + type == OperatorType::kUnpack || type == OperatorType::kTopK_V2 || type == OperatorType::kRandomUniform || type == OperatorType::kResizeNearestNeighbor || type == OperatorType::kPRelu || type == OperatorType::kReduceMax || diff --git a/tensorflow/lite/toco/model.h b/tensorflow/lite/toco/model.h index fcee42c2294..67510c2b3b1 100644 --- a/tensorflow/lite/toco/model.h +++ b/tensorflow/lite/toco/model.h @@ -223,6 +223,7 @@ enum class ArrayDataType : uint8 { kUint64, // 10 kString, kComplex64, + kFloat16, }; // Compile-time logic to map ArrayDataType to the corresponding C++ scalar type diff --git a/tensorflow/lite/toco/python/BUILD b/tensorflow/lite/toco/python/BUILD index 2f5654c56e0..122117b36e6 100644 --- a/tensorflow/lite/toco/python/BUILD +++ b/tensorflow/lite/toco/python/BUILD @@ -61,6 +61,7 @@ tf_py_wrap_cc( py_binary( name = "toco_from_protos", srcs = ["toco_from_protos.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":tensorflow_wrap_toco", diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc index 15c4d7457b1..09fe72f1ae0 100644 --- a/tensorflow/lite/toco/tflite/operator.cc +++ b/tensorflow/lite/toco/tflite/operator.cc @@ -1806,6 +1806,13 @@ class Unpack : public BuiltinOperator<UnpackOperator, ::tflite::UnpackOptions, } int GetVersion(const OperatorSignature& op_signature) const override { + const string& input_name = op_signature.op->inputs[0]; + const Array& input_array = op_signature.model->GetArray(input_name); + // If the op take int8/uint8 input, it is version 2. 
+ if (input_array.data_type == ArrayDataType::kInt8 || + input_array.data_type == ArrayDataType::kUint8) { + return 2; + } return 1; } }; diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc index 937b69e331e..eece77327cb 100644 --- a/tensorflow/lite/toco/tflite/operator_test.cc +++ b/tensorflow/lite/toco/tflite/operator_test.cc @@ -818,6 +818,31 @@ TEST_F(OperatorTest, VersioningPackTest) { SimpleVersioningTest<PackOperator>(); } +TEST_F(OperatorTest, VersioningUnpackTest) { + UnpackOperator op; + op.inputs = {"input1"}; + auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/); + const BaseOperator* base_op = operator_by_type_map.at(op.type).get(); + + Model int32_model; + Array& int32_array = int32_model.GetOrCreateArray(op.inputs[0]); + int32_array.data_type = ArrayDataType::kInt32; + OperatorSignature int32_signature = {.op = &op, .model = &int32_model}; + EXPECT_EQ(base_op->GetVersion(int32_signature), 1); + + Model uint8_model; + Array& uint8_array = uint8_model.GetOrCreateArray(op.inputs[0]); + uint8_array.data_type = ArrayDataType::kUint8; + OperatorSignature uint8_signature = {.op = &op, .model = &uint8_model}; + EXPECT_EQ(base_op->GetVersion(uint8_signature), 2); + + Model int8_model; + Array& int8_array = int8_model.GetOrCreateArray(op.inputs[0]); + int8_array.data_type = ArrayDataType::kInt8; + OperatorSignature int8_signature = {.op = &op, .model = &int8_model}; + EXPECT_EQ(base_op->GetVersion(int8_signature), 2); +} + TEST_F(OperatorTest, VersioningBatchToSpaceNDTest) { SimpleVersioningTest<BatchToSpaceNDOperator>(); } diff --git a/tensorflow/lite/toco/types.proto b/tensorflow/lite/toco/types.proto index fa911b8a4c8..2c655517431 100644 --- a/tensorflow/lite/toco/types.proto +++ b/tensorflow/lite/toco/types.proto @@ -46,4 +46,7 @@ enum IODataType { // Int8, quantized based on QuantizationParameters in schema. INT8 = 9; + + // Half precision float, not quantized. + FLOAT16 = 10; } diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD index d4428f4d498..c692b948692 100644 --- a/tensorflow/lite/tools/benchmark/BUILD +++ b/tensorflow/lite/tools/benchmark/BUILD @@ -89,9 +89,9 @@ cc_library( ":logging", "//tensorflow/lite:framework", "//tensorflow/lite:string_util", - "//tensorflow/lite/delegates/nnapi:nnapi_delegate", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/profiling:profile_summarizer", + "//tensorflow/lite/profiling:profiler", "//tensorflow/lite/tools/evaluation:utils", "@gemmlowp", ], diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md index e432d81f8ec..d5c89bd266b 100644 --- a/tensorflow/lite/tools/benchmark/README.md +++ b/tensorflow/lite/tools/benchmark/README.md @@ -45,6 +45,8 @@ and the following optional parameters: * `use_gpu`: `bool` (default=false) \ Whether to use the [GPU accelerator delegate](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/delegates/gpu). This option is currently only available on Android devices. +* `enable_op_profiling`: `bool` (default=false) \ + Whether to enable per-operator profiling measurement. ## To build/install/run @@ -129,19 +131,18 @@ where `f0` is the affinity mask for big cores on Pixel 2. Note: The affinity mask varies with the device. ## Profiling model operators -The benchmark model binary also allows you to profile operators and give execution times of each operator. 
To do this, -compile the binary with a compiler flag that enables profiling to be compiled in. Pass **--copt=-DTFLITE_PROFILING_ENABLED** -to compile benchmark with profiling support. -For example, to compile with profiling support on Android, add this flag to the previous command: +The benchmark model binary also allows you to profile operators and give +execution times of each operator. To do this, pass the flag +`--enable_op_profiling=true` to `benchmark_model` during invocation, e.g., ``` -bazel build -c opt \ - --config=android_arm \ - --cxxopt='--std=c++11' \ - --copt=-DTFLITE_PROFILING_ENABLED \ - tensorflow/lite/tools/benchmark:benchmark_model +adb shell taskset f0 /data/local/tmp/benchmark_model \ + --graph=/data/local/tmp/mobilenet_quant_v1_224.tflite \ + --enable_op_profiling=true ``` -This compiles TFLite with profiling enabled, now you can run the benchmark binary like before. The binary will produce detailed statistics for each operation similar to those shown below: + +When enabled, the `benchmark_model` binary will produce detailed statistics for +each operation similar to those shown below: ``` diff --git a/tensorflow/lite/tools/benchmark/benchmark_test.cc b/tensorflow/lite/tools/benchmark/benchmark_test.cc index c7fbc24a477..8fd625cf141 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_test.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_test.cc @@ -48,6 +48,7 @@ BenchmarkParams CreateParams() { params.AddParam("warmup_min_secs", BenchmarkParam::Create<float>(0.5f)); params.AddParam("use_legacy_nnapi", BenchmarkParam::Create<bool>(false)); params.AddParam("use_gpu", BenchmarkParam::Create<bool>(false)); + params.AddParam("enable_op_profiling", BenchmarkParam::Create<bool>(false)); return params; } diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc index 161ae1df034..eddaedf9279 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc @@ -26,6 +26,8 @@ limitations under the License. #include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/model.h" #include "tensorflow/lite/op_resolver.h" +#include "tensorflow/lite/profiling/buffered_profiler.h" +#include "tensorflow/lite/profiling/profile_summarizer.h" #include "tensorflow/lite/string_util.h" #include "tensorflow/lite/tools/benchmark/logging.h" #include "tensorflow/lite/tools/evaluation/utils.h" @@ -40,12 +42,44 @@ void RegisterSelectedOps(::tflite::MutableOpResolver* resolver); namespace tflite { namespace benchmark { +namespace { -void ProfilingListener::SetInterpreter(tflite::Interpreter* interpreter) { - TFLITE_BENCHMARK_CHECK(interpreter); - interpreter_ = interpreter; - interpreter_->SetProfiler(&profiler_); -} +// Backward compat with previous approach to enabling op profiling. +#if defined(TFLITE_PROFILING_ENABLED) +constexpr int kOpProfilingEnabledDefault = true; +#else +constexpr int kOpProfilingEnabledDefault = false; +#endif + +// Dumps profiling events if profiling is enabled. 
+class ProfilingListener : public BenchmarkListener { + public: + explicit ProfilingListener(Interpreter* interpreter) + : interpreter_(interpreter), has_profiles_(false) { + TFLITE_BENCHMARK_CHECK(interpreter); + interpreter_->SetProfiler(&profiler_); + } + + void OnSingleRunStart(RunType run_type) override; + + void OnSingleRunEnd() override; + + void OnBenchmarkEnd(const BenchmarkResults& results) override; + + private: + Interpreter* interpreter_; + profiling::BufferedProfiler profiler_; + profiling::ProfileSummarizer summarizer_; + bool has_profiles_; +}; + +// Dumps gemmlowp profiling events if gemmlowp profiling is enabled. +class GemmlowpProfilingListener : public BenchmarkListener { + public: + void OnBenchmarkStart(const BenchmarkParams& params) override; + + void OnBenchmarkEnd(const BenchmarkResults& results) override; +}; void ProfilingListener::OnSingleRunStart(RunType run_type) { if (run_type == REGULAR) { @@ -82,8 +116,6 @@ void GemmlowpProfilingListener::OnBenchmarkEnd( #endif } -namespace { - std::vector<std::string> Split(const std::string& str, const char delim) { std::istringstream input(str); std::vector<std::string> results; @@ -201,6 +233,9 @@ BenchmarkParams BenchmarkTfLiteModel::DefaultParams() { BenchmarkParam::Create<bool>(false)); default_params.AddParam("use_gpu", BenchmarkParam::Create<bool>(false)); default_params.AddParam("allow_fp16", BenchmarkParam::Create<bool>(false)); + default_params.AddParam( + "enable_op_profiling", + BenchmarkParam::Create<bool>(kOpProfilingEnabledDefault)); return default_params; } @@ -209,8 +244,6 @@ BenchmarkTfLiteModel::BenchmarkTfLiteModel() BenchmarkTfLiteModel::BenchmarkTfLiteModel(BenchmarkParams params) : BenchmarkModel(std::move(params)) { - AddListener(&profiling_listener_); - AddListener(&gemmlowp_profiling_listener_); } void BenchmarkTfLiteModel::CleanUp() { @@ -236,7 +269,8 @@ std::vector<Flag> BenchmarkTfLiteModel::GetFlags() { CreateFlag<bool>("use_nnapi", ¶ms_, "use nnapi delegate api"), CreateFlag<bool>("use_legacy_nnapi", ¶ms_, "use legacy nnapi api"), CreateFlag<bool>("use_gpu", ¶ms_, "use gpu"), - CreateFlag<bool>("allow_fp16", ¶ms_, "allow fp16")}; + CreateFlag<bool>("allow_fp16", ¶ms_, "allow fp16"), + CreateFlag<bool>("enable_op_profiling", ¶ms_, "enable op profiling")}; flags.insert(flags.end(), specific_flags.begin(), specific_flags.end()); return flags; @@ -255,6 +289,8 @@ void BenchmarkTfLiteModel::LogParams() { TFLITE_LOG(INFO) << "Use gpu : [" << params_.Get<bool>("use_gpu") << "]"; TFLITE_LOG(INFO) << "Allow fp16 : [" << params_.Get<bool>("allow_fp16") << "]"; + TFLITE_LOG(INFO) << "Enable op profiling: [" + << params_.Get<bool>("enable_op_profiling") << "]"; } bool BenchmarkTfLiteModel::ValidateParams() { @@ -306,6 +342,12 @@ void BenchmarkTfLiteModel::PrepareInputData() { FillRandomValue<int32_t>(t_data.data.i32, num_elements, []() { return static_cast<int32_t>(rand()) % 100; }); + } else if (t->type == kTfLiteInt16) { + t_data.bytes = sizeof(int16_t) * num_elements; + t_data.data.raw = new char[t_data.bytes]; + FillRandomValue<int16_t>(t_data.data.i16, num_elements, []() { + return static_cast<int16_t>(rand()) % 100; + }); } else if (t->type == kTfLiteUInt8) { t_data.bytes = sizeof(uint8_t) * num_elements; t_data.data.raw = new char[t_data.bytes]; @@ -340,6 +382,9 @@ void BenchmarkTfLiteModel::ResetInputsAndOutputs() { } else if (t->type == kTfLiteInt32) { std::memcpy(interpreter->typed_tensor<int32_t>(i), inputs_data_[j].data.i32, inputs_data_[j].bytes); + } else if (t->type == kTfLiteInt16) { + 
std::memcpy(interpreter->typed_tensor<int16_t>(i), + inputs_data_[j].data.i16, inputs_data_[j].bytes); } else if (t->type == kTfLiteUInt8) { std::memcpy(interpreter->typed_tensor<uint8_t>(i), inputs_data_[j].data.uint8, inputs_data_[j].bytes); @@ -382,7 +427,6 @@ void BenchmarkTfLiteModel::Init() { if (!interpreter) { TFLITE_LOG(FATAL) << "Failed to construct interpreter"; } - profiling_listener_.SetInterpreter(interpreter.get()); interpreter->UseNNAPI(params_.Get<bool>("use_legacy_nnapi")); @@ -433,6 +477,16 @@ void BenchmarkTfLiteModel::Init() { if (delegates_.empty() && interpreter->AllocateTensors() != kTfLiteOk) { TFLITE_LOG(FATAL) << "Failed to allocate tensors!"; } + + // Install profilers if necessary. + if (params_.Get<bool>("enable_op_profiling")) { + profiling_listener_.reset(new ProfilingListener(interpreter.get())); + AddListener(profiling_listener_.get()); + } +#ifdef GEMMLOWP_PROFILING + gemmlowp_profiling_listener_.reset(new GemmlowpProfilingListener()); + AddListener(gemmlowp_profiling_listener_.get()); +#endif } BenchmarkTfLiteModel::TfLiteDelegatePtrMap BenchmarkTfLiteModel::GetDelegates() diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h index 99b9ce35246..dd0bec108eb 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h @@ -22,42 +22,12 @@ limitations under the License. #include <vector> #include "tensorflow/lite/model.h" -#include "tensorflow/lite/profiling/profile_summarizer.h" +#include "tensorflow/lite/profiling/profiler.h" #include "tensorflow/lite/tools/benchmark/benchmark_model.h" namespace tflite { namespace benchmark { -// Dumps profiling events if profiling is enabled. -class ProfilingListener : public BenchmarkListener { - public: - explicit ProfilingListener() : interpreter_(nullptr), has_profiles_(false) {} - - void SetInterpreter(Interpreter* interpreter); - - void OnSingleRunStart(RunType run_type) override; - - void OnSingleRunEnd() override; - - void OnBenchmarkEnd(const BenchmarkResults& results) override; - - private: - Interpreter* interpreter_; - profiling::Profiler profiler_; - profiling::ProfileSummarizer summarizer_; - bool has_profiles_; -}; - -// Dumps gemmlowp profiling events if gemmlowp profiling is enabled. -class GemmlowpProfilingListener : public BenchmarkListener { - public: - virtual ~GemmlowpProfilingListener() {} - - void OnBenchmarkStart(const BenchmarkParams& params) override; - - void OnBenchmarkEnd(const BenchmarkResults& results) override; -}; - // Benchmarks a TFLite model by running tflite interpreter. 
class BenchmarkTfLiteModel : public BenchmarkModel { public: @@ -99,8 +69,8 @@ class BenchmarkTfLiteModel : public BenchmarkModel { }; std::vector<InputLayerInfo> inputs; std::vector<InputTensorData> inputs_data_; - ProfilingListener profiling_listener_; - GemmlowpProfilingListener gemmlowp_profiling_listener_; + std::unique_ptr<BenchmarkListener> profiling_listener_; + std::unique_ptr<BenchmarkListener> gemmlowp_profiling_listener_; TfLiteDelegatePtrMap delegates_; }; diff --git a/tensorflow/lite/tools/make/build_ios_universal_lib.sh b/tensorflow/lite/tools/make/build_ios_universal_lib.sh index 8b617ef5937..3678f554d08 100755 --- a/tensorflow/lite/tools/make/build_ios_universal_lib.sh +++ b/tensorflow/lite/tools/make/build_ios_universal_lib.sh @@ -32,7 +32,7 @@ BUILD_ARCHS="x86_64 armv7 armv7s arm64" while getopts "a:p" opt_name; do case "$opt_name" in a) BUILD_ARCHS="${OPTARG}";; - p) profiling_args='-DGEMMLOWP_PROFILING,-DTFLITE_PROFILING_ENABLED';; + p) profiling_args='-DGEMMLOWP_PROFILING';; *) usage;; esac done diff --git a/tensorflow/lite/tools/optimize/model_utils.cc b/tensorflow/lite/tools/optimize/model_utils.cc index e0755e45c96..de6bf009363 100644 --- a/tensorflow/lite/tools/optimize/model_utils.cc +++ b/tensorflow/lite/tools/optimize/model_utils.cc @@ -130,7 +130,7 @@ void SetOperatorCodeVersion(ModelT* model) { operator_property::OperatorProperty property = operator_property::GetOperatorProperty(op_buildin_code); if (property.quantizable) { - // Only update the versions of non-quantizable operations. + // Only update the versions of quantizable operations. op_code->version = property.version; } } diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc index 058dc316d2f..87454630026 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.cc +++ b/tensorflow/lite/tools/optimize/quantize_model.cc @@ -260,8 +260,7 @@ TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type, // outpus must have the same scale and zero point. The other ones with // constraints(averagepool, maxpool, gather, softmax, tanh etc) are handled in // QuantizeWeightsAndInput. -TfLiteStatus ApplyConstraints(flatbuffers::FlatBufferBuilder* builder, - ModelT* model, ErrorReporter* error_reporter) { +TfLiteStatus ApplyConstraints(ModelT* model, ErrorReporter* error_reporter) { for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get(); @@ -540,8 +539,7 @@ TfLiteStatus QuantizeOpOutput(ModelT* model, int32_t subgraph_idx, // Quantize inputs and weights. // Because of ops such as lstm, still need to do per op, instead of weights. -TfLiteStatus QuantizeWeightsInputOutput(flatbuffers::FlatBufferBuilder* builder, - ModelT* model, bool allow_float, +TfLiteStatus QuantizeWeightsInputOutput(ModelT* model, bool allow_float, ErrorReporter* error_reporter) { for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { @@ -576,8 +574,7 @@ TfLiteStatus QuantizeWeightsInputOutput(flatbuffers::FlatBufferBuilder* builder, } // Quantize bias. 
-TfLiteStatus QuantizeBiases(flatbuffers::FlatBufferBuilder* builder, - ModelT* model, ErrorReporter* error_reporter) { +TfLiteStatus QuantizeBiases(ModelT* model, ErrorReporter* error_reporter) { for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get(); @@ -636,9 +633,9 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, const TensorType& output_type, bool allow_float, ErrorReporter* error_reporter) { TF_LITE_ENSURE_STATUS( - QuantizeWeightsInputOutput(builder, model, allow_float, error_reporter)); - TF_LITE_ENSURE_STATUS(ApplyConstraints(builder, model, error_reporter)); - TF_LITE_ENSURE_STATUS(QuantizeBiases(builder, model, error_reporter)); + QuantizeWeightsInputOutput(model, allow_float, error_reporter)); + TF_LITE_ENSURE_STATUS(ApplyConstraints(model, error_reporter)); + TF_LITE_ENSURE_STATUS(QuantizeBiases(model, error_reporter)); utils::SetOperatorCodeVersion(model); TF_LITE_ENSURE_STATUS( SetInputAndOutputTypes(model, input_type, output_type, error_reporter)); diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index 74b030d9439..b19a3c05973 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -5,6 +5,7 @@ tensorflow/contrib/tpu/profiler/pip_package/build_pip_package.sh tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/__init__.py tensorflow/contrib/mpi/BUILD +tensorflow/stream_executor/build_defs.bzl tensorflow/tools/ci_build/remote/BUILD tensorflow/tools/pip_package/README tensorflow/tools/pip_package/MANIFEST.in @@ -257,7 +258,6 @@ tensorflow/third_party/__init__.py tensorflow/third_party/libxsmm.BUILD tensorflow/third_party/zlib.BUILD tensorflow/third_party/eigen.BUILD -tensorflow/stream_executor/build_defs.bzl tensorflow/api_template_v1.__init__.py tensorflow/compat_template_v1.__init__.py tensorflow/compat_template.__init__.py diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 72a81f6742e..0d478ea95e0 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -1420,9 +1420,7 @@ tf_py_test( ], main = "platform/benchmark_test.py", tags = [ - "manual", "no_pip", - "notap", ], ) diff --git a/tensorflow/python/autograph/converters/continue_statements.py b/tensorflow/python/autograph/converters/continue_statements.py index dc1e8c8cb0f..028017d1644 100644 --- a/tensorflow/python/autograph/converters/continue_statements.py +++ b/tensorflow/python/autograph/converters/continue_statements.py @@ -46,9 +46,11 @@ class _Block(object): continue statement. create_guard: bool, whether a guard should be created because a continue statement has just been encountered. + is_loop_type: bool, whether this block is the body of a loop. """ def __init__(self): + self.is_loop_type = False self.reset_guard_state() def reset_guard_state(self): @@ -61,7 +63,13 @@ class ContinueCanonicalizationTransformer(converter.Base): def visit_Continue(self, node): self.state[_Continue].used = True - self.state[_Block].reset_guard_state() + for block in reversed(self.state[_Block].stack): + block.reset_guard_state() + # See ContinueCanonicalizationTest.test_multiple_continues for an example + # it's necessary to reset the state of all enclosing affected blocks, not + # just that of the current block. 
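To make the guard-state handling above easier to follow, here is an editorial, simplified illustration of what continue canonicalization does to a loop body; the generated name is invented (the real transformer creates fresh symbols for the control variable):

```python
# Editorial illustration (simplified; autograph generates its own symbol
# names): a `continue` becomes an assignment to a guard variable, and the
# statements that follow it are wrapped in a guard check.

# Before canonicalization:
def before(a):
  v = []
  for x in a:
    x -= 1
    if x > 100:
      continue
    v.append(x)
  return v

# Roughly what the transformer produces:
def after(a):
  v = []
  for x in a:
    continue_ = False
    x -= 1
    if x > 100:
      continue_ = True
    if not continue_:
      v.append(x)
  return v
```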
+ if block.is_loop_type: + break template = """ var_name = True """ @@ -112,6 +120,7 @@ class ContinueCanonicalizationTransformer(converter.Base): def _visit_loop_body(self, node, nodes): self.state[_Continue].enter() self.state[_Block].enter() + self.state[_Block].is_loop_type = True scope = anno.getanno(node, NodeAnno.BODY_SCOPE) continue_var = self.ctx.namer.new_symbol('continue_', scope.referenced) self.state[_Continue].control_var_name = continue_var diff --git a/tensorflow/python/autograph/converters/continue_statements_test.py b/tensorflow/python/autograph/converters/continue_statements_test.py index 0dbc8e78b93..8891d8167d3 100644 --- a/tensorflow/python/autograph/converters/continue_statements_test.py +++ b/tensorflow/python/autograph/converters/continue_statements_test.py @@ -54,10 +54,10 @@ class ContinueCanonicalizationTest(converter_testing.TestCase): v = [] while x > 0: x -= 1 - if x > 2: - continue if x > 1: continue + if x > 2: + continue v.append(x) return v @@ -66,6 +66,26 @@ class ContinueCanonicalizationTest(converter_testing.TestCase): self.assertTransformedEquivalent(test_fn, 3) self.assertTransformedEquivalent(test_fn, 4) + def test_multiple_continues_in_nested_scope(self): + + def test_fn(a): + v = [] + for x in a: + x -= 1 + if x > 100: + continue + try: + raise ValueError('intentional') + except ValueError: + continue + v.append(x) + return v + + self.assertTransformedEquivalent(test_fn, []) + self.assertTransformedEquivalent(test_fn, [1]) + self.assertTransformedEquivalent(test_fn, [2]) + self.assertTransformedEquivalent(test_fn, [1, 2, 3]) + def test_for_loop(self): def test_fn(a): diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py index 218408743a0..21c3d18ed85 100644 --- a/tensorflow/python/autograph/converters/control_flow.py +++ b/tensorflow/python/autograph/converters/control_flow.py @@ -483,7 +483,7 @@ class ControlFlowTransformer(converter.Base): ssf_map) else: # Loop with no loop-carried state and no early stopping - assert not has_extra_test, ('Early stoppiong (e.g. break and/or return) ' + assert not has_extra_test, ('Early stopping (e.g. 
break and/or return) ' 'should create state variables.') loop_nodes = self._for_loop_without_state(node, body_name, node_body) diff --git a/tensorflow/python/autograph/converters/return_statements.py b/tensorflow/python/autograph/converters/return_statements.py index 3173e676e5d..a53206c867d 100644 --- a/tensorflow/python/autograph/converters/return_statements.py +++ b/tensorflow/python/autograph/converters/return_statements.py @@ -375,8 +375,10 @@ class ReturnStatementsTransformer(converter.Base): if self.default_to_null_return: template = """ do_return_var_name = False - retval_var_name = None + retval_var_name = ag__.UndefinedReturnValue() body + if ag__.is_undefined_return(retval_var_name): + retval_var_name = None return retval_var_name """ else: diff --git a/tensorflow/python/autograph/converters/return_statements_test.py b/tensorflow/python/autograph/converters/return_statements_test.py index b2d3d1b9205..869731175d3 100644 --- a/tensorflow/python/autograph/converters/return_statements_test.py +++ b/tensorflow/python/autograph/converters/return_statements_test.py @@ -49,7 +49,7 @@ class SingleReturnTest(converter_testing.TestCase): self.assertTransformedEquivalent(test_fn, 2) self.assertTransformedEquivalent(test_fn, -2) - def test_missing_else(self): + def test_contitional_missing_else(self): def test_fn(x): if x > 0: @@ -58,7 +58,7 @@ class SingleReturnTest(converter_testing.TestCase): self.assertTransformedEquivalent(test_fn, 2) self.assertTransformedEquivalent(test_fn, -2) - def test_missing_else_then_default(self): + def test_conditional_missing_else_then_default(self): def test_fn(x): if x > 0: @@ -68,7 +68,7 @@ class SingleReturnTest(converter_testing.TestCase): self.assertTransformedEquivalent(test_fn, 2) self.assertTransformedEquivalent(test_fn, -2) - def test_else_only_then_default(self): + def test_conditional_else_only_then_default(self): def test_fn(x): if x < 0: @@ -216,6 +216,25 @@ class SingleReturnTest(converter_testing.TestCase): self.assertTransformedEquivalent(test_fn, 3) self.assertTransformedEquivalent(test_fn, 4) + def test_multiple_returns_in_nested_scope(self): + + def test_fn(a): + v = [] + for x in a: + x -= 1 + if x > 100: + return v + try: + raise ValueError('intentional') + except ValueError: # pylint:disable=bare-except + return v + v.append(x) + return v + + self.assertTransformedEquivalent(test_fn, []) + self.assertTransformedEquivalent(test_fn, [1]) + self.assertTransformedEquivalent(test_fn, [2]) + self.assertTransformedEquivalent(test_fn, [1, 2, 3]) if __name__ == '__main__': test.main() diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py index 98264bcd22b..df75aa089b3 100644 --- a/tensorflow/python/autograph/impl/api.py +++ b/tensorflow/python/autograph/impl/api.py @@ -24,6 +24,7 @@ import functools import inspect import os import pdb +import re import sys import textwrap @@ -256,7 +257,8 @@ def converted_call(f, owner, options, args, kwargs): # Other built-in modules are permanently whitelisted. # TODO(mdan): Figure out how to do this consistently for all stdlib modules. 
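A rough sketch of what the whitelist addition below means in practice (the regex and import paths are illustrative; the new `api_test.py` case further down exercises the same path):

```python
# Rough sketch (regex is illustrative): calls that resolve into permanently
# whitelisted stdlib modules, now including `re`, are executed unconverted
# by converted_call instead of being compiled by autograph.
import re

from tensorflow.python.autograph.core import converter
from tensorflow.python.autograph.impl import api

opts = converter.ConversionOptions(recursive=True)
pattern = api.converted_call('compile', re, opts, (r'(weights|kernel):0$',), {})
assert pattern.search('dense/weights:0') is not None
```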
- if any(f in m.__dict__.values() for m in (collections, pdb, copy, inspect)): + if any( + f in m.__dict__.values() for m in (collections, pdb, copy, inspect, re)): logging.log(2, 'Permanently whitelisted: %s: part of builtin module', f) return _call_unconverted(f, args, kwargs) diff --git a/tensorflow/python/autograph/impl/api_test.py b/tensorflow/python/autograph/impl/api_test.py index 7bd351f2786..d65a4ed8f60 100644 --- a/tensorflow/python/autograph/impl/api_test.py +++ b/tensorflow/python/autograph/impl/api_test.py @@ -23,6 +23,7 @@ import functools import gc import imp import os +import re import textwrap import types @@ -204,7 +205,8 @@ class ApiTest(test.TestCase): def test_method(self, x, s, a): while tf.reduce_sum(x) > s: x //= api.converted_call(self.called_member, None, - converter.ConversionOptions(), (a,), {}) + converter.ConversionOptions(recursive=True), + (a,), {}) return x tc = TestClass() @@ -214,9 +216,16 @@ class ApiTest(test.TestCase): self.assertListEqual([0, 1], self.evaluate(x).tolist()) def test_converted_call_builtin(self): - x = api.converted_call(range, None, converter.ConversionOptions(), (3,), {}) + x = api.converted_call(range, None, + converter.ConversionOptions(recursive=True), (3,), + {}) self.assertEqual((0, 1, 2), tuple(x)) + x = api.converted_call('compile', re, + converter.ConversionOptions(recursive=True), + ('mnas_v4_a.*\\/.*(weights|kernel):0$',), {}) + self.assertIsNotNone(x.match('mnas_v4_a/weights:0')) + def test_converted_call_function(self): def test_fn(x): @@ -224,7 +233,8 @@ class ApiTest(test.TestCase): return -x return x - x = api.converted_call(test_fn, None, converter.ConversionOptions(), + x = api.converted_call(test_fn, None, + converter.ConversionOptions(recursive=True), (constant_op.constant(-1),), {}) self.assertEqual(1, self.evaluate(x)) @@ -238,13 +248,15 @@ class ApiTest(test.TestCase): x = api.converted_call( functools.partial(test_fn, constant_op.constant(-1), z=-3), None, - converter.ConversionOptions(), (constant_op.constant(-2),), {}) + converter.ConversionOptions(recursive=True), + (constant_op.constant(-2),), {}) self.assertEqual((1, 2, 3), self.evaluate(x)) x = api.converted_call( functools.partial( functools.partial(test_fn, constant_op.constant(-1)), z=-3), None, - converter.ConversionOptions(), (constant_op.constant(-2),), {}) + converter.ConversionOptions(recursive=True), + (constant_op.constant(-2),), {}) self.assertEqual((1, 2, 3), self.evaluate(x)) def test_converted_call_method_explicit_owner(self): @@ -268,8 +280,8 @@ class ApiTest(test.TestCase): return self.x tc = TestClass(constant_op.constant(-1)) - x = api.converted_call(tc.test_method, None, converter.ConversionOptions(), - (), {}) + x = api.converted_call(tc.test_method, None, + converter.ConversionOptions(recursive=True), (), {}) self.assertEqual(1, self.evaluate(x)) def test_converted_call_synthetic_method(self): @@ -287,8 +299,8 @@ class ApiTest(test.TestCase): tc = TestClass(constant_op.constant(-1)) test_method = types.MethodType(test_function, tc) - x = api.converted_call(test_method, None, converter.ConversionOptions(), - (), {}) + x = api.converted_call(test_method, None, + converter.ConversionOptions(recursive=True), (), {}) self.assertEqual(1, self.evaluate(x)) def test_converted_call_method_wrapper(self): @@ -301,8 +313,9 @@ class ApiTest(test.TestCase): tc = TestClass() # `method.__get__()` returns a so-called method-wrapper. 
- wrapper = api.converted_call( - '__get__', tc.foo, converter.ConversionOptions(), (tc,), {}) + wrapper = api.converted_call('__get__', tc.foo, + converter.ConversionOptions(recursive=True), + (tc,), {}) self.assertEqual(wrapper, tc.foo) def test_converted_call_method_as_object_attribute(self): @@ -326,7 +339,7 @@ class ApiTest(test.TestCase): tc = TestClass(obj.method) x = api.converted_call('another_obj_method', tc, - converter.ConversionOptions(), (), {}) + converter.ConversionOptions(recursive=True), (), {}) self.assertEqual(self.evaluate(x), 2) def test_converted_call_method_converts_recursively(self): @@ -363,7 +376,8 @@ class ApiTest(test.TestCase): tc = TestClass(constant_op.constant(-1)) x = api.converted_call(TestClass.test_method, None, - converter.ConversionOptions(), (tc,), {}) + converter.ConversionOptions(recursive=True), (tc,), + {}) self.assertEqual(1, self.evaluate(x)) def test_converted_call_callable_object(self): @@ -379,7 +393,8 @@ class ApiTest(test.TestCase): return self.x tc = TestClass(constant_op.constant(-1)) - x = api.converted_call(tc, None, converter.ConversionOptions(), (), {}) + x = api.converted_call(tc, None, + converter.ConversionOptions(recursive=True), (), {}) self.assertEqual(1, self.evaluate(x)) @test_util.run_deprecated_v1 @@ -395,7 +410,8 @@ class ApiTest(test.TestCase): return -self.x return self.x - tc = api.converted_call(TestClass, None, converter.ConversionOptions(), + tc = api.converted_call(TestClass, None, + converter.ConversionOptions(recursive=True), (constant_op.constant(-1),), {}) # tc is still a TestClass - constructors are whitelisted. # TODO(b/124016764): Support this use case. @@ -409,13 +425,14 @@ class ApiTest(test.TestCase): def f(x): return x == 0 - x = api.converted_call(f, None, converter.ConversionOptions(), + x = api.converted_call(f, None, converter.ConversionOptions(recursive=True), (constant_op.constant(0),), {}) self.assertTrue(self.evaluate(x)) converted_f = api.to_graph( f, experimental_optional_features=converter.Feature.ALL) - x = api.converted_call(converted_f, None, converter.ConversionOptions(), + x = api.converted_call(converted_f, None, + converter.ConversionOptions(recursive=True), (constant_op.constant(0),), {}) self.assertTrue(self.evaluate(x)) @@ -431,7 +448,7 @@ class ApiTest(test.TestCase): def f(g, x): return g(x) - x = api.converted_call(f, None, converter.ConversionOptions(), + x = api.converted_call(f, None, converter.ConversionOptions(recursive=True), (g, constant_op.constant(1)), {}) self.assertEqual(self.evaluate(x), 1) @@ -465,7 +482,7 @@ class ApiTest(test.TestCase): def test_converted_call_whitelisted_method(self): - opts = converter.ConversionOptions() + opts = converter.ConversionOptions(recursive=True) model = sequential.Sequential([ core.Dense(2) @@ -479,7 +496,7 @@ class ApiTest(test.TestCase): def test_converted_call_whitelisted_method_via_owner(self): - opts = converter.ConversionOptions() + opts = converter.ConversionOptions(recursive=True) model = sequential.Sequential([ core.Dense(2) @@ -493,7 +510,7 @@ class ApiTest(test.TestCase): def test_converted_call_numpy(self): - opts = converter.ConversionOptions() + opts = converter.ConversionOptions(recursive=True) x = api.converted_call(np.arange, None, opts, (5,), {}) @@ -525,7 +542,7 @@ class ApiTest(test.TestCase): def test_converted_call_namedtuple(self): - opts = converter.ConversionOptions() + opts = converter.ConversionOptions(recursive=True) x = api.converted_call(collections.namedtuple, None, opts, ('TestNamedtuple', ('a', 'b')), 
{}) @@ -534,7 +551,7 @@ class ApiTest(test.TestCase): def test_converted_call_namedtuple_via_collections(self): - opts = converter.ConversionOptions() + opts = converter.ConversionOptions(recursive=True) x = api.converted_call('namedtuple', collections, opts, ('TestNamedtuple', ('a', 'b')), {}) @@ -543,7 +560,7 @@ class ApiTest(test.TestCase): def test_converted_call_lambda(self): - opts = converter.ConversionOptions() + opts = converter.ConversionOptions(recursive=True) l = lambda x: x == 0 @@ -554,7 +571,7 @@ class ApiTest(test.TestCase): def test_converted_call_defun_object_method(self): - opts = converter.ConversionOptions() + opts = converter.ConversionOptions(recursive=True) # pylint:disable=method-hidden class TestClass(object): @@ -590,7 +607,7 @@ class ApiTest(test.TestCase): def f(y): return res.x + y - opts = converter.ConversionOptions() + opts = converter.ConversionOptions(recursive=True) api.converted_call(f, None, opts, (1,), {}) self.assertNoMemoryLeaks(test_fn) @@ -607,7 +624,7 @@ class ApiTest(test.TestCase): return inner_f - opts = converter.ConversionOptions() + opts = converter.ConversionOptions(recursive=True) api.converted_call(f, None, opts, (1,), {})() self.assertNoMemoryLeaks(test_fn) diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py index ee472ab5167..6979a1a4847 100644 --- a/tensorflow/python/autograph/impl/conversion.py +++ b/tensorflow/python/autograph/impl/conversion.py @@ -335,16 +335,10 @@ def is_whitelisted_for_graph(o, check_call_override=True): if hasattr(m, '__name__'): # Builtins typically have unnamed modules. for prefix, in config.DEFAULT_UNCOMPILED_MODULES: - if m.__name__.startswith(prefix): + if m.__name__.startswith(prefix + '.') or m.__name__ == prefix: logging.log(2, 'Whitelisted: %s: name starts with "%s"', o, prefix) return True - # Temporary -- whitelist tensorboard modules. - # TODO(b/122731813): Remove. 
- if m.__name__ == 'tensorboard' or '.tensorboard' in m.__name__: - logging.log(2, 'Whitelisted: %s: name contains "tensorboard"', o) - return True - if hasattr(o, 'autograph_info__') or hasattr(o, '__ag_compiled'): logging.log(2, 'Whitelisted: %s: already converted', o) return True diff --git a/tensorflow/python/autograph/impl/conversion_test.py b/tensorflow/python/autograph/impl/conversion_test.py index bdbdb87e7b0..02efb788e98 100644 --- a/tensorflow/python/autograph/impl/conversion_test.py +++ b/tensorflow/python/autograph/impl/conversion_test.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import imp import gast from tensorflow.python.autograph import utils @@ -46,6 +47,16 @@ class ConversionTest(test.TestCase): self.assertTrue(conversion.is_whitelisted_for_graph(utils)) self.assertTrue(conversion.is_whitelisted_for_graph(constant_op.constant)) + def test_is_whitelisted_for_graph_tensorflow_like(self): + + tf_like = imp.new_module('tensorflow_foo') + def test_fn(): + pass + tf_like.test_fn = test_fn + test_fn.__module__ = tf_like + + self.assertFalse(conversion.is_whitelisted_for_graph(tf_like.test_fn)) + def test_convert_entity_to_ast_unsupported_types(self): with self.assertRaises(NotImplementedError): program_ctx = self._simple_program_ctx() diff --git a/tensorflow/python/autograph/operators/__init__.py b/tensorflow/python/autograph/operators/__init__.py index 5b3f45de056..bbc684eaf2b 100644 --- a/tensorflow/python/autograph/operators/__init__.py +++ b/tensorflow/python/autograph/operators/__init__.py @@ -72,4 +72,6 @@ from tensorflow.python.autograph.operators.slices import get_item from tensorflow.python.autograph.operators.slices import GetItemOpts from tensorflow.python.autograph.operators.slices import set_item from tensorflow.python.autograph.operators.special_values import is_undefined +from tensorflow.python.autograph.operators.special_values import is_undefined_return from tensorflow.python.autograph.operators.special_values import Undefined +from tensorflow.python.autograph.operators.special_values import UndefinedReturnValue diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py index d1428bb524a..5575b4c1911 100644 --- a/tensorflow/python/autograph/operators/control_flow.py +++ b/tensorflow/python/autograph/operators/control_flow.py @@ -42,6 +42,7 @@ INEFFICIENT_UNROLL_MIN_OPS = 1 def _disallow_undefs_into_loop(*values): + """Ensures that all values in the state are defined when entering a loop.""" undefined = tuple(filter(special_values.is_undefined, values)) if undefined: raise ValueError( @@ -49,6 +50,14 @@ def _disallow_undefs_into_loop(*values): ' before the loop: {}'.format( tuple(s.symbol_name for s in undefined))) + for value in values: + if special_values.is_undefined_return(value): + # Assumption: the loop will only capture the variable which tracks the + # return value if the loop contained a return statement. + # TODO(mdan): This should be checked at the place where return occurs. + raise ValueError( + 'Return statements are not supported within a TensorFlow loop.') + def for_stmt(iter_, extra_test, body, init_state): """Functional form of a for statement. 
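The new check above means that a `return` inside a loop which AutoGraph stages as a TensorFlow loop (i.e. the iterate is a tensor) is rejected up front instead of silently flowing an undefined value through the loop state. A minimal sketch of user code that is expected to hit this error, assuming a `tf.function`-decorated function on a build with this change (the function and values are illustrative, not part of this diff):

    import tensorflow as tf

    @tf.function
    def first_large(items):
      # `items` is a Tensor argument, so AutoGraph stages this as a TF loop.
      for x in items:
        if x > 10:
          return x  # expected: ValueError, return inside a TensorFlow loop
      return tf.constant(-1)

    first_large(tf.constant([1, 5, 20]))

A `return` inside a plain Python loop (one that is unrolled rather than staged) is unaffected.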
@@ -435,8 +444,8 @@ def if_stmt(cond, body, orelse, get_state, set_state): def tf_if_stmt(cond, body, orelse, get_state, set_state): """Overload of if_stmt that stages a TF cond.""" - body = _wrap_disallow_undefs_in_cond(body, branch_name='if') - orelse = _wrap_disallow_undefs_in_cond(orelse, branch_name='else') + body = _wrap_disallow_undefs_from_cond(body, branch_name='if') + orelse = _wrap_disallow_undefs_from_cond(orelse, branch_name='else') body = _isolate_state(body, get_state, set_state) orelse = _isolate_state(orelse, get_state, set_state) @@ -484,7 +493,7 @@ def _isolate_state(func, get_state, set_state): return wrapper -def _wrap_disallow_undefs_in_cond(func, branch_name): +def _wrap_disallow_undefs_from_cond(func, branch_name): """Wraps conditional branch to disallow returning undefined symbols.""" def wrapper(): @@ -503,6 +512,13 @@ def _wrap_disallow_undefs_in_cond(func, branch_name): ' statement.'.format(branch_name, tuple(s.symbol_name for s in undefined))) + for result in results_tuple: + if special_values.is_undefined_return(result): + raise ValueError( + 'A value must also be returned from the {} branch. If a value is ' + 'returned from one branch of a conditional a value must be ' + 'returned from all branches.'.format(branch_name)) + return results return wrapper diff --git a/tensorflow/python/autograph/operators/slices.py b/tensorflow/python/autograph/operators/slices.py index 2b7f5ad9226..af4074cc55a 100644 --- a/tensorflow/python/autograph/operators/slices.py +++ b/tensorflow/python/autograph/operators/slices.py @@ -22,6 +22,7 @@ import collections from tensorflow.python.framework import dtypes from tensorflow.python.framework import tensor_util +from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_string_ops from tensorflow.python.ops import list_ops from tensorflow.python.ops import tensor_array_ops @@ -119,9 +120,7 @@ def set_item(target, i, x): if target.dtype == dtypes.variant: return _tf_tensor_list_set_item(target, i, x) else: - raise ValueError( - 'tensor lists are expected to be Tensors with dtype=tf.variant,' - ' instead found %s' % target) + return _tf_tensor_set_item(target, i, x) else: return _py_set_item(target, i, x) @@ -136,6 +135,11 @@ def _tf_tensor_list_set_item(target, i, x): return list_ops.tensor_list_set_item(target, i, x) +def _tf_tensor_set_item(target, i, x): + """Overload of set_item that stages a Tensor scatter update.""" + return gen_array_ops.tensor_scatter_update(target, ((i,),), (x,)) + + def _py_set_item(target, i, x): """Overload of set_item that executes a Python list modification.""" target[i] = x diff --git a/tensorflow/python/autograph/operators/special_values.py b/tensorflow/python/autograph/operators/special_values.py index 13d846fc7cf..a41f516e550 100644 --- a/tensorflow/python/autograph/operators/special_values.py +++ b/tensorflow/python/autograph/operators/special_values.py @@ -64,3 +64,13 @@ def is_undefined(value): Boolean, whether the input value is undefined. 
""" return isinstance(value, Undefined) + + +class UndefinedReturnValue(object): + """Represents a default return value from a function (None in Python).""" + pass + + +def is_undefined_return(value): + """Checks whether `value` is the default return value.""" + return isinstance(value, UndefinedReturnValue) diff --git a/tensorflow/python/autograph/pyct/transformer.py b/tensorflow/python/autograph/pyct/transformer.py index 6ab36b1e581..910cd84403b 100644 --- a/tensorflow/python/autograph/pyct/transformer.py +++ b/tensorflow/python/autograph/pyct/transformer.py @@ -93,6 +93,7 @@ class _StateStack(object): Attributes: type: Any, the type of objects that this stack holds level: int, the current stack depth + stack: List[Any], the actual stack value: Any, the instance of the object at the top of the stack """ @@ -110,6 +111,10 @@ class _StateStack(object): def exit(self): return self._stack.pop() + @property + def stack(self): + return self._stack + @property def level(self): return len(self._stack) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 09a9000ed03..97391d5b9b9 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -27,7 +27,7 @@ import datetime from tensorflow.python.util import tf_contextlib from tensorflow.python.util.tf_export import tf_export -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 5, 6) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 5, 11) @tf_export("compat.forward_compatible") @@ -76,9 +76,10 @@ def forward_compatible(year, month, day): the code that adds the new operation is committed. Args: - year: A year (e.g., 2018). - month: A month (1 <= month <= 12) in year. - day: A day (1 <= day <= 31, or 30, or 29, or 28) in month. + year: A year (e.g., 2018). Must be an `int`. + month: A month (1 <= month <= 12) in year. Must be an `int`. + day: A day (1 <= day <= 31, or 30, or 29, or 28) in month. Must be an + `int`. Returns: True if the caller can expect that serialized TensorFlow graphs produced @@ -118,10 +119,11 @@ def forward_compatibility_horizon(year, month, day): # Test that generate_graph_with_new_features() has an effect ``` - Args : - year: A year (e.g. 2018). - month: A month (1 <= month <= 12) in year. - day: A day (1 <= day <= 31, or 30, or 29, or 28) in month. + Args: + year: A year (e.g., 2018). Must be an `int`. + month: A month (1 <= month <= 12) in year. Must be an `int`. + day: A day (1 <= day <= 31, or 30, or 29, or 28) in month. Must be an + `int`. Yields: Nothing. 
diff --git a/tensorflow/python/data/benchmarks/BUILD b/tensorflow/python/data/benchmarks/BUILD index 031476100f4..8e06bd33ebd 100644 --- a/tensorflow/python/data/benchmarks/BUILD +++ b/tensorflow/python/data/benchmarks/BUILD @@ -9,6 +9,7 @@ load("//tensorflow:tensorflow.bzl", "py_test") py_test( name = "meta_benchmark", srcs = ["meta_benchmark.py"], + python_version = "PY2", deps = [ "//tensorflow/python:client_testlib", "//tensorflow/python:session", @@ -31,6 +32,7 @@ py_library( py_test( name = "batch_benchmark", srcs = ["batch_benchmark.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":benchmark_base", @@ -43,6 +45,7 @@ py_test( py_test( name = "filter_benchmark", srcs = ["filter_benchmark.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":benchmark_base", @@ -53,6 +56,7 @@ py_test( py_test( name = "from_tensor_slices_benchmark", srcs = ["from_tensor_slices_benchmark.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":benchmark_base", @@ -64,6 +68,7 @@ py_test( py_test( name = "list_files_benchmark", srcs = ["list_files_benchmark.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":benchmark_base", @@ -79,6 +84,7 @@ py_test( py_test( name = "map_benchmark", srcs = ["map_benchmark.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":benchmark_base", @@ -89,6 +95,7 @@ py_test( py_test( name = "range_benchmark", srcs = ["range_benchmark.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":benchmark_base", diff --git a/tensorflow/python/data/benchmarks/benchmark_base.py b/tensorflow/python/data/benchmarks/benchmark_base.py index 11aaebacc08..85c894551f1 100644 --- a/tensorflow/python/data/benchmarks/benchmark_base.py +++ b/tensorflow/python/data/benchmarks/benchmark_base.py @@ -30,7 +30,7 @@ from tensorflow.python.platform import test class DatasetBenchmarkBase(test.Benchmark): """Base class for dataset benchmarks.""" - def run_benchmark(self, dataset, num_elements, iters=1): + def run_benchmark(self, dataset, num_elements, iters=1, warmup=True): """Benchmarks the dataset. Runs the dataset `iters` times. In each iteration, the benchmark measures @@ -41,6 +41,7 @@ class DatasetBenchmarkBase(test.Benchmark): num_elements: Number of dataset elements to iterate through each benchmark iteration. iters: Number of times to repeat the timing. + warmup: If true, warms up the session caches by running an untimed run. Returns: A float, representing the per-element wall time of the dataset in seconds. @@ -62,9 +63,10 @@ class DatasetBenchmarkBase(test.Benchmark): deltas = [] for _ in range(iters): with session.Session() as sess: - # Run once to warm up the session caches. - sess.run(iterator.initializer) - sess.run(next_element) + if warmup: + # Run once to warm up the session caches. + sess.run(iterator.initializer) + sess.run(next_element) sess.run(iterator.initializer) start = time.time() @@ -78,9 +80,10 @@ class DatasetBenchmarkBase(test.Benchmark): num_elements, name, iters=5, - extras=None): + extras=None, + warmup=True): # Measure the per-element wall time. 
- wall_time = self.run_benchmark(dataset, num_elements, iters) + wall_time = self.run_benchmark(dataset, num_elements, iters, warmup) if extras is None: extras = {} diff --git a/tensorflow/python/data/experimental/benchmarks/BUILD b/tensorflow/python/data/experimental/benchmarks/BUILD index 42381a3468a..9d0a263c809 100644 --- a/tensorflow/python/data/experimental/benchmarks/BUILD +++ b/tensorflow/python/data/experimental/benchmarks/BUILD @@ -183,6 +183,25 @@ py_test( ], ) +py_test( + name = "snapshot_dataset_benchmark", + srcs = ["snapshot_dataset_benchmark.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:errors", + "//tensorflow/python:io_ops", + "//tensorflow/python:session", + "//tensorflow/python:util", + "//tensorflow/python/data/benchmarks:benchmark_base", + "//tensorflow/python/data/experimental/ops:snapshot", + "//tensorflow/python/data/kernel_tests:test_base", + "//tensorflow/python/data/ops:dataset_ops", + "//third_party/py/numpy", + ], +) + py_test( name = "unbatch_benchmark", srcs = ["unbatch_benchmark.py"], diff --git a/tensorflow/python/data/experimental/benchmarks/snapshot_dataset_benchmark.py b/tensorflow/python/data/experimental/benchmarks/snapshot_dataset_benchmark.py new file mode 100644 index 00000000000..79b93c8e3b3 --- /dev/null +++ b/tensorflow/python/data/experimental/benchmarks/snapshot_dataset_benchmark.py @@ -0,0 +1,98 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Benchmarks for `tf.data.experimental.snapshot()`.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import shutil + +from tensorflow.python.client import session +from tensorflow.python.data.benchmarks import benchmark_base +from tensorflow.python.data.experimental.ops import snapshot +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import errors_impl as errors +from tensorflow.python.framework import test_util +from tensorflow.python.ops import gen_array_ops +from tensorflow.python.platform import test + + +@test_util.run_all_in_graph_and_eager_modes +class SnapshotDatasetBenchmark(benchmark_base.DatasetBenchmarkBase): + """Benchmarks for `tf.data.experimental.snapshot()`.""" + + def _makeSnapshotDirectory(self): + tmp_dir = test.get_temp_dir() + tmp_dir = os.path.join(tmp_dir, "snapshot") + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + os.mkdir(tmp_dir) + return tmp_dir + + def _createSimpleDataset(self, num_elems, tmp_dir=None): + if not tmp_dir: + tmp_dir = self._makeSnapshotDirectory() + + dataset = dataset_ops.Dataset.from_tensor_slices([1.0]) + dataset = dataset.map( + lambda x: gen_array_ops.broadcast_to(x, [50, 50, 3])) + dataset = dataset.repeat(num_elems) + dataset = dataset.apply(snapshot.snapshot(tmp_dir)) + + return dataset + + def _consumeDataset(self, dataset, num_elems): + dataset = dataset.skip(num_elems) + next_element = dataset_ops.make_one_shot_iterator(dataset).get_next() + with session.Session() as sess: + try: + sess.run(next_element) + except errors.OutOfRangeError: + pass + + def benchmarkWriteSnapshotSimple(self): + num_elems = 500000 + dataset = self._createSimpleDataset(num_elems) + + # We only run one iteration here because running multiple iterations will + # cause the later iterations to simply read from the already written + # snapshot rather than write a new one. + self.run_and_report_benchmark(dataset, num_elems, "write_simple", + warmup=False, iters=1) + + def benchmarkPassthroughSnapshotSimple(self): + num_elems = 100000 + tmp_dir = self._makeSnapshotDirectory() + dataset = self._createSimpleDataset(num_elems, tmp_dir) + + # Consume only 1 element, thus making sure we don't finalize. 
+ self._consumeDataset(dataset, 1) + + self.run_and_report_benchmark(dataset, num_elems, "passthrough_simple") + + def benchmarkReadSnapshotSimple(self): + num_elems = 100000 + tmp_dir = self._makeSnapshotDirectory() + dataset = self._createSimpleDataset(num_elems, tmp_dir) + + # consume all the elements to let snapshot write things to disk + self._consumeDataset(dataset, num_elems) + + self.run_and_report_benchmark(dataset, num_elems, "read_simple") + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD index 20af2322530..d90c7a99176 100644 --- a/tensorflow/python/data/experimental/kernel_tests/BUILD +++ b/tensorflow/python/data/experimental/kernel_tests/BUILD @@ -11,6 +11,7 @@ py_test( name = "bucket_by_sequence_length_test", size = "medium", srcs = ["bucket_by_sequence_length_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/python:array_ops", @@ -30,6 +31,7 @@ py_test( py_test( name = "cardinality_test", srcs = ["cardinality_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/python/data/experimental/ops:cardinality", @@ -64,6 +66,7 @@ py_test( name = "counter_test", size = "small", srcs = ["counter_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/python:client_testlib", @@ -77,6 +80,7 @@ py_test( name = "csv_dataset_test", size = "medium", srcs = ["csv_dataset_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -97,6 +101,7 @@ py_test( py_test( name = "dense_to_sparse_batch_test", srcs = ["dense_to_sparse_batch_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/python:array_ops", @@ -114,6 +119,7 @@ py_test( name = "directed_interleave_dataset_test", size = "medium", srcs = ["directed_interleave_dataset_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/python:client_testlib", @@ -130,6 +136,7 @@ py_test( name = "auto_shard_dataset_test", size = "medium", srcs = ["auto_shard_dataset_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -148,6 +155,7 @@ py_test( name = "get_single_element_test", size = "small", srcs = ["get_single_element_test.py"], + python_version = "PY2", deps = [ "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", @@ -167,6 +175,7 @@ py_test( name = "group_by_reducer_test", size = "medium", srcs = ["group_by_reducer_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/python:array_ops", @@ -188,6 +197,7 @@ py_test( name = "group_by_window_test", size = "medium", srcs = ["group_by_window_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/python:array_ops", @@ -209,6 +219,7 @@ py_test( py_test( name = "ignore_errors_test", srcs = ["ignore_errors_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/python:array_ops", @@ -227,6 +238,7 @@ py_test( name = "make_batched_features_dataset_test", size = "medium", srcs = ["make_batched_features_dataset_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -248,6 +260,7 @@ py_test( name = "make_csv_dataset_test", size = "medium", srcs = ["make_csv_dataset_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -267,6 +280,7 @@ py_test( name = 
"make_tf_record_dataset_test", size = "medium", srcs = ["make_tf_record_dataset_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -284,6 +298,7 @@ py_test( name = "map_and_batch_test", size = "medium", srcs = ["map_and_batch_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/python:array_ops", @@ -307,6 +322,7 @@ py_test( name = "map_defun_op_test", size = "small", srcs = ["map_defun_op_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -332,6 +348,7 @@ py_test( name = "matching_files_test", size = "small", srcs = ["matching_files_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -351,6 +368,7 @@ py_test( name = "override_threadpool_test", size = "small", srcs = ["override_threadpool_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -371,6 +389,7 @@ py_test( name = "parallel_interleave_test", size = "medium", srcs = ["parallel_interleave_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -393,6 +412,7 @@ py_test( name = "parse_example_dataset_test", size = "small", srcs = ["parse_example_dataset_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/core:protos_all_py", @@ -455,11 +475,13 @@ py_test( name = "rebatch_dataset_test", size = "small", srcs = ["rebatch_dataset_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/python:client_testlib", "//tensorflow/python/data/experimental/ops:batching", "//tensorflow/python/data/experimental/ops:distribute", + "//tensorflow/python/data/experimental/ops:grouping", "//tensorflow/python/data/kernel_tests:test_base", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/data/util:nest", @@ -471,6 +493,7 @@ py_test( name = "rejection_resample_test", size = "medium", srcs = ["rejection_resample_test.py"], + python_version = "PY2", shard_count = 2, srcs_version = "PY2AND3", tags = [ @@ -498,6 +521,7 @@ py_test( name = "restructured_dataset_test", size = "medium", srcs = ["restructured_dataset_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/python:array_ops", @@ -514,6 +538,7 @@ py_test( name = "scan_test", size = "small", srcs = ["scan_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -538,6 +563,7 @@ py_test( name = "shuffle_and_repeat_test", size = "medium", srcs = ["shuffle_and_repeat_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_pip", @@ -557,6 +583,7 @@ py_test( py_test( name = "sleep_test", srcs = ["sleep_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/python:client_testlib", @@ -589,6 +616,7 @@ py_test( name = "sql_dataset_test", size = "medium", srcs = ["sql_dataset_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -602,6 +630,7 @@ py_test( py_test( name = "snapshot_test", srcs = ["snapshot_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":reader_dataset_ops_test_base", @@ -621,6 +650,7 @@ py_test( name = "stats_dataset_ops_test", size = "large", srcs = ["stats_dataset_ops_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_pip", @@ -659,6 +689,7 @@ py_test( name = "take_while_test", size = "small", srcs = ["take_while_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", 
deps = [ "//tensorflow/python:array_ops", @@ -681,6 +712,7 @@ py_test( name = "tf_record_writer_test", size = "small", srcs = ["tf_record_writer_test.py"], + python_version = "PY2", deps = [ "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", @@ -698,6 +730,7 @@ py_test( name = "unbatch_test", size = "medium", srcs = ["unbatch_test.py"], + python_version = "PY2", deps = [ "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", @@ -724,6 +757,7 @@ py_test( name = "unique_test", size = "small", srcs = ["unique_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -740,6 +774,7 @@ py_test( py_test( name = "variant_test", srcs = ["variant_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/python:client_testlib", diff --git a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py index 8fb86a1fe4e..20ea60b7770 100644 --- a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py @@ -21,12 +21,15 @@ from absl.testing import parameterized from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.experimental.ops import distribute +from tensorflow.python.data.experimental.ops import grouping from tensorflow.python.data.experimental.ops import scan_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import nest +from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variables from tensorflow.python.platform import test @@ -338,5 +341,25 @@ class RebatchDatasetTest(test_base.DatasetTestBase): for _ in range(2)] self.assertDatasetProduces(rebatched_dataset, expected_output) + def testGroupByWindowBatching(self, drop_remainder): + dataset = dataset_ops.Dataset.from_tensor_slices( + [[array_ops.constant(i, dtype=dtypes.int64)] * 3 for i in range(40)]) + reduce_fn = lambda bucket_id, ds: ds.batch( + batch_size=10, drop_remainder=drop_remainder) + dataset = dataset.apply( + grouping.group_by_window( + key_func=lambda x: x[0] % 4, reduce_func=reduce_fn, window_size=10)) + rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=2) + + self.assertEqual([[5, 3] if drop_remainder else [None, 3]], + [ts.as_list() for ts in _flat_shapes(rebatched_dataset)]) + # pylint: disable=g-complex-comprehension + expected_output = [[[j + i * 4 + k * 20] * 3 + for i in range(5)] + for j in range(4) + for k in range(2)] + self.assertDatasetProduces(rebatched_dataset, expected_output) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD index f0fc2d57a1f..b566bb68ad9 100644 --- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD +++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD @@ -33,6 +33,7 @@ py_test( name = "batch_dataset_serialization_test", size = "medium", srcs = ["batch_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -54,6 +55,7 @@ py_test( 
name = "cache_dataset_serialization_test", size = "small", srcs = ["cache_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -73,6 +75,7 @@ py_test( name = "checkpoint_input_pipeline_hook_test", size = "small", srcs = ["checkpoint_input_pipeline_hook_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_pip", @@ -97,6 +100,7 @@ py_test( name = "choose_fastest_branch_dataset_serialization_test", size = "medium", srcs = ["choose_fastest_branch_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -118,6 +122,7 @@ py_test( name = "choose_fastest_dataset_serialization_test", size = "small", srcs = ["choose_fastest_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -136,6 +141,7 @@ py_test( name = "concatenate_dataset_serialization_test", size = "small", srcs = ["concatenate_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -154,6 +160,7 @@ py_test( name = "csv_dataset_serialization_test", size = "small", srcs = ["csv_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -172,6 +179,7 @@ py_test( name = "dataset_constructor_serialization_test", size = "medium", srcs = ["dataset_constructor_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -191,6 +199,7 @@ py_test( name = "auto_shard_dataset_serialization_test", size = "medium", srcs = ["auto_shard_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -214,6 +223,7 @@ py_test( name = "filter_dataset_serialization_test", size = "medium", srcs = ["filter_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -234,6 +244,7 @@ py_test( name = "fixed_length_record_dataset_serialization_test", size = "medium", srcs = ["fixed_length_record_dataset_serialization_test.py"], + python_version = "PY2", shard_count = 4, srcs_version = "PY2AND3", tags = [ @@ -253,6 +264,7 @@ py_test( name = "flat_map_dataset_serialization_test", size = "medium", srcs = ["flat_map_dataset_serialization_test.py"], + python_version = "PY2", tags = [ "no_oss", "no_pip", @@ -278,6 +290,7 @@ py_test( name = "group_by_reducer_serialization_test", size = "medium", srcs = ["group_by_reducer_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -297,6 +310,7 @@ py_test( name = "group_by_window_serialization_test", size = "medium", srcs = ["group_by_window_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -316,6 +330,7 @@ py_test( name = "ignore_errors_serialization_test", size = "small", srcs = ["ignore_errors_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -336,6 +351,7 @@ py_test( name = "interleave_dataset_serialization_test", size = "medium", srcs = ["interleave_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -357,6 +373,7 @@ py_test( name = "map_and_batch_dataset_serialization_test", size = "medium", srcs = ["map_and_batch_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -376,6 +393,7 @@ py_test( name = "map_dataset_serialization_test", size = "medium", srcs = 
["map_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -402,6 +420,7 @@ py_test( name = "matching_files_dataset_serialization_test", size = "small", srcs = ["matching_files_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_windows", @@ -419,6 +438,7 @@ py_test( name = "optimize_dataset_serialization_test", size = "small", srcs = ["optimize_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -437,6 +457,7 @@ py_test( name = "rebatch_dataset_serialization_test", size = "small", srcs = ["rebatch_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -455,6 +476,7 @@ py_test( name = "padded_batch_dataset_serialization_test", size = "medium", srcs = ["padded_batch_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -475,6 +497,7 @@ py_test( name = "parallel_interleave_dataset_serialization_test", size = "medium", srcs = ["parallel_interleave_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -496,6 +519,7 @@ py_test( name = "parallel_map_dataset_serialization_test", size = "medium", srcs = ["parallel_map_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -521,6 +545,7 @@ py_test( name = "parse_example_dataset_serialization_test", size = "medium", srcs = ["parse_example_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -538,6 +563,7 @@ py_test( name = "prefetch_dataset_serialization_test", size = "small", srcs = ["prefetch_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -555,6 +581,7 @@ py_test( name = "range_dataset_serialization_test", size = "small", srcs = ["range_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -579,6 +606,7 @@ py_test( name = "sample_from_datasets_serialization_test", size = "medium", srcs = ["sample_from_datasets_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -597,6 +625,7 @@ py_test( name = "scan_dataset_serialization_test", size = "small", srcs = ["scan_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -615,6 +644,7 @@ py_test( name = "sequence_dataset_serialization_test", size = "medium", srcs = ["sequence_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -633,6 +663,7 @@ py_test( name = "serialization_integration_test", size = "small", srcs = ["serialization_integration_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -652,6 +683,7 @@ py_test( name = "shard_dataset_serialization_test", size = "medium", srcs = ["shard_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -670,6 +702,7 @@ py_test( name = "shuffle_and_repeat_dataset_serialization_test", size = "medium", srcs = ["shuffle_and_repeat_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -688,6 +721,7 @@ py_test( name = "shuffle_dataset_serialization_test", size = "medium", srcs = ["shuffle_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = 
"PY2AND3", tags = [ "no_oss", @@ -708,6 +742,7 @@ py_test( name = "sql_dataset_serialization_test", size = "small", srcs = ["sql_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -728,6 +763,7 @@ py_test( name = "stats_dataset_serialization_test", size = "medium", srcs = ["stats_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -749,6 +785,7 @@ py_test( name = "take_while_dataset_serialization_test", size = "small", srcs = ["take_while_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -768,6 +805,7 @@ py_test( name = "textline_dataset_serialization_test", size = "medium", srcs = ["textline_dataset_serialization_test.py"], + python_version = "PY2", shard_count = 4, srcs_version = "PY2AND3", tags = [ @@ -787,6 +825,7 @@ py_test( name = "tf_record_dataset_serialization_test", size = "medium", srcs = ["tf_record_dataset_serialization_test.py"], + python_version = "PY2", shard_count = 4, srcs_version = "PY2AND3", tags = [ @@ -806,6 +845,7 @@ py_test( name = "unbatch_dataset_serialization_test", size = "medium", srcs = ["unbatch_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -825,6 +865,7 @@ py_test( name = "unique_dataset_serialization_test", size = "small", srcs = ["unique_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -843,6 +884,7 @@ py_test( name = "zip_dataset_serialization_test", size = "small", srcs = ["zip_dataset_serialization_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", diff --git a/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py b/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py index 50090f2971e..d21a4814017 100644 --- a/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py @@ -114,6 +114,16 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase): self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1) + def testWriteSnapshotRepeatAfterwards(self): + tmpdir = self.makeSnapshotDirectory() + + dataset = dataset_ops.Dataset.range(10) + dataset = dataset.apply(snapshot.snapshot(tmpdir)) + dataset = dataset.repeat(10) + self.assertDatasetProduces(dataset, list(range(10)) * 10) + + self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1) + def testWriteSnapshotMultiFileSuccessful(self): tmpdir = self.makeSnapshotDirectory() diff --git a/tensorflow/python/data/experimental/ops/distribute_options.py b/tensorflow/python/data/experimental/ops/distribute_options.py index d594398ab38..3c5b4a6b520 100644 --- a/tensorflow/python/data/experimental/ops/distribute_options.py +++ b/tensorflow/python/data/experimental/ops/distribute_options.py @@ -46,3 +46,10 @@ class DistributeOptions(options.OptionsBase): "using strategy.experimental_distribute_dataset(). In other cases, this " "option does nothing. If None, defaults to True.", default_factory=lambda: True) + + num_devices = options.create_option( + name="num_devices", + ty=int, + docstring= + "The number of devices attached to this input pipeline. 
This will be " + "automatically set by MultiDeviceIterator.") diff --git a/tensorflow/python/data/kernel_tests/dataset_test.py b/tensorflow/python/data/kernel_tests/dataset_test.py index 91f4bab5631..7edcf0799f6 100644 --- a/tensorflow/python/data/kernel_tests/dataset_test.py +++ b/tensorflow/python/data/kernel_tests/dataset_test.py @@ -373,9 +373,6 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase): second_dataset = dataset_ops.Dataset.range(11) self.assertEqual(55, self.evaluate(_uses_dataset(second_dataset))) first_concrete = _uses_dataset.get_concrete_function(first_dataset) - self.skipTest( - ("Not currently working: functions treat Datasets as opaque Python " - "objects")) # The dataset should not be a captured input self.assertEmpty(first_concrete.graph.captures) # The two datasets have the same structure and so should re-use a trace. @@ -387,5 +384,25 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase): _uses_dataset.get_concrete_function( dataset_ops.Dataset.zip((first_dataset, second_dataset)))) + def testLimitedRetracingWithCompositeTensors(self): + trace_count = [0] + + @def_function.function + def f(ds): + trace_count[0] += 1 + counter = np.int64(0) + for elem in ds: + counter += elem + return counter + + dataset = dataset_ops.Dataset.range(5) + dataset2 = dataset_ops.Dataset.range(10) + + for _ in range(10): + self.assertEqual(self.evaluate(f(dataset)), 10) + self.assertEqual(self.evaluate(f(dataset2)), 45) + self.assertEqual(trace_count[0], 1) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py index c379afcb160..6f2bc6cadcd 100644 --- a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py +++ b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py @@ -335,6 +335,45 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase, self.evaluate(elem_on_2) +@test_util.run_all_in_graph_and_eager_modes +class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): + + @test_util.run_v1_only("b/121264236") + def testPrefetchWithSlackOption(self): + dataset = dataset_ops.Dataset.range(10) + options = dataset_ops.Options() + options.experimental_slack = True + dataset = dataset.with_options(options) + multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator( + dataset, ["/cpu:1", "/cpu:2"]) + dataset = multi_device_iterator._dataset # pylint: disable=protected-access + self.assertIn("slack", dataset.options()._static_optimizations()) + self.assertIn("slack:slack_period:2", + dataset.options()._static_optimization_configs()) + + config = config_pb2.ConfigProto(device_count={"CPU": 3}) + with self.test_session(config=config): + self.evaluate(multi_device_iterator.initializer) + for i in range(0, 10, 2): + elem_on_1, elem_on_2 = multi_device_iterator.get_next() + self.assertEqual(i, self.evaluate(elem_on_1)) + self.assertEqual(i + 1, self.evaluate(elem_on_2)) + with self.assertRaises(errors.OutOfRangeError): + elem_on_1, elem_on_2 = multi_device_iterator.get_next() + self.evaluate(elem_on_1) + self.evaluate(elem_on_2) + + def testPrefetchWithSlackOptionWithoutIterator(self): + dataset = dataset_ops.Dataset.range(10) + options = dataset_ops.Options() + options.experimental_slack = True + dataset = dataset.with_options(options) + self.assertIn("slack", dataset.options()._static_optimizations()) + self.assertIn("slack:slack_period:1", + 
dataset.options()._static_optimization_configs()) + + self.assertDatasetProduces(dataset, range(10)) + if __name__ == "__main__": ops.enable_eager_execution( config=config_pb2.ConfigProto(device_count={"CPU": 3, "GPU": 1})) diff --git a/tensorflow/python/data/kernel_tests/prefetch_test.py b/tensorflow/python/data/kernel_tests/prefetch_test.py index 8d076f6e685..ca59dd067a0 100644 --- a/tensorflow/python/data/kernel_tests/prefetch_test.py +++ b/tensorflow/python/data/kernel_tests/prefetch_test.py @@ -40,6 +40,15 @@ class PrefetchTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = dataset_ops.Dataset.range(10).prefetch(buffer_size=buffer_size) self.evaluate(dataset._variant_tensor) + @parameterized.parameters(*[(buffer_size, slack_period) + for buffer_size in (-1, None, 0, 5) + for slack_period in (1, 8)]) + def testPrefetchWithSlack(self, buffer_size, slack_period): + dataset = dataset_ops.Dataset.range(100) + dataset = dataset_ops.PrefetchDataset( + dataset, buffer_size, slack_period=slack_period) + self.assertDatasetProduces(dataset, expected_output=range(100)) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index 937d3bc9c44..7ffd79d71a4 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -43,6 +43,7 @@ from tensorflow.python.data.util import structure as structure_lib from tensorflow.python.data.util import traverse from tensorflow.python.eager import context from tensorflow.python.eager import function as eager_function +from tensorflow.python.framework import composite_tensor from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import function @@ -80,7 +81,7 @@ ops.NotDifferentiable("ReduceDataset") @tf_export("data.Dataset", v1=[]) @six.add_metaclass(abc.ABCMeta) -class DatasetV2(tracking_base.Trackable): +class DatasetV2(tracking_base.Trackable, composite_tensor.CompositeTensor): """Represents a potentially large set of elements. A `Dataset` can be used to represent an input pipeline as a @@ -237,7 +238,10 @@ class DatasetV2(tracking_base.Trackable): if t_options.private_threadpool_size is not None: dataset = _PrivateThreadPoolDataset(dataset, t_options.private_threadpool_size) - static_optimizations = options._static_optimizations() # pylint: disable=protected-access + # pylint: disable=protected-access + static_optimizations = options._static_optimizations() + static_optimization_configs = options._static_optimization_configs() + # pylint: enable=protected-access if static_optimizations: if self._has_captured_ref(): warnings.warn( @@ -248,7 +252,7 @@ class DatasetV2(tracking_base.Trackable): ", ".join(static_optimizations)) else: dataset = _OptimizeDataset(dataset, static_optimizations, - options._static_optimization_configs()) # pylint: disable=protected-access + static_optimization_configs) autotune = True cpu_budget = 0 # Indicates that all CPU cores should be used. 
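Making `DatasetV2` a `CompositeTensor` (the class change above, together with the `_to_components`/`_from_components` methods below) lets `tf.function` see a dataset's element structure rather than an opaque Python object, so datasets with the same structure can share a single trace, as `testLimitedRetracingWithCompositeTensors` exercises. A minimal sketch of that behavior, mirroring the test's internal modules and assuming eager execution on a build with this change:

    import numpy as np

    from tensorflow.python.data.ops import dataset_ops
    from tensorflow.python.eager import def_function

    trace_count = [0]

    @def_function.function
    def reduce_sum(ds):
      trace_count[0] += 1  # Python side effect: only runs when tracing.
      total = np.int64(0)
      for elem in ds:
        total += elem
      return total

    reduce_sum(dataset_ops.Dataset.range(5))   # sums to 10
    reduce_sum(dataset_ops.Dataset.range(10))  # sums to 45; same structure
    assert trace_count[0] == 1                 # only one trace was created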
@@ -300,6 +304,24 @@ class DatasetV2(tracking_base.Trackable): return ("<%s shapes: %s, types: %s>" % (type(self).__name__, output_shapes, output_types)) + def _to_components(self): + return [self._variant_tensor] + + def _component_metadata(self): + return self._element_structure + + @classmethod + def _from_components(cls, components, metadata): + return _VariantDataset(components[0], metadata) + + def _shape_invariant_to_components(self, shape=None): + del shape # not used + return tensor_shape.TensorShape([]) # dataset component is always a scalar. + + @property + def _is_graph_tensor(self): + return hasattr(self._variant_tensor, "graph") + @staticmethod def from_tensors(tensors): """Creates a `Dataset` with a single element, comprising the given tensors. @@ -2068,6 +2090,15 @@ class Options(options_lib.OptionsBase): "`tf.data.experimental.OptimizationOptions` for more details.", default_factory=optimization_options.OptimizationOptions) + experimental_slack = options_lib.create_option( + name="experimental_slack", + ty=bool, + docstring="Whether to introduce 'slack' in the last `prefetch` of the " + "input pipeline, if it exists. This may reduce CPU contention with " + "accelerator host-side activity at the start of a step. The slack " + "frequency is determined by the number of devices attached to this " + "input pipeline. If None, defaults to False.") + experimental_stats = options_lib.create_option( name="experimental_stats", ty=stats_options.StatsOptions, @@ -2095,11 +2126,23 @@ class Options(options_lib.OptionsBase): exp_stats_options = self.experimental_stats if exp_stats_options and exp_stats_options.latency_all_edges: result.append("latency_all_edges") + if self.experimental_slack: + result.append("slack") return result def _static_optimization_configs(self): """Produces the list of configurations for enabled static optimizations.""" - return self.experimental_optimization._static_optimization_configs() # pylint: disable=protected-access + result = [] + if self.experimental_optimization: + result.extend( + self.experimental_optimization._static_optimization_configs()) # pylint: disable=protected-access + + if self.experimental_slack: + num_devices = self.experimental_distribute.num_devices + if num_devices is None: + num_devices = 1 + result.append("slack:slack_period:%d" % num_devices) + return result def merge(self, options): """Merges itself with the given `tf.data.Options`. 
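`experimental_slack` is exposed as a plain boolean on `Options`; the slack period itself comes from `experimental_distribute.num_devices` (filled in automatically by `MultiDeviceIterator`, defaulting to 1), which is how the `"slack:slack_period:%d"` configuration above is produced. A minimal sketch of enabling it by hand, mirroring the tests earlier in this change:

    from tensorflow.python.data.ops import dataset_ops

    options = dataset_ops.Options()
    options.experimental_slack = True

    dataset = dataset_ops.Dataset.range(10)
    dataset = dataset.with_options(options)

    # Without a MultiDeviceIterator, num_devices defaults to 1.
    # pylint: disable=protected-access
    assert "slack" in dataset.options()._static_optimizations()
    assert "slack:slack_period:1" in dataset.options()._static_optimization_configs()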
@@ -2277,6 +2320,14 @@ class DatasetStructure(structure_lib.Structure): def __init__(self, element_structure): self._element_structure = element_structure + def __eq__(self, other): + # pylint: disable=protected-access + return (isinstance(other, DatasetStructure) and + self._element_structure == other._element_structure) + + def __hash__(self): + return hash(self._element_structure) + @property def _flat_shapes(self): return [tensor_shape.scalar()] @@ -2725,6 +2776,7 @@ class RangeDataset(DatasetSource): def __init__(self, *args): """See `Dataset.range()` for details.""" self._parse_args(*args) + self._structure = structure_lib.TensorStructure(dtypes.int64, []) variant_tensor = gen_dataset_ops.range_dataset( start=self._start, stop=self._stop, @@ -2754,7 +2806,7 @@ class RangeDataset(DatasetSource): @property def _element_structure(self): - return structure_lib.TensorStructure(dtypes.int64, []) + return self._structure class CacheDataset(UnaryUnchangedStructureDataset): @@ -3345,8 +3397,19 @@ class FilterDataset(UnaryUnchangedStructureDataset): class PrefetchDataset(UnaryUnchangedStructureDataset): """A `Dataset` that asynchronously prefetches its input.""" - def __init__(self, input_dataset, buffer_size): - """See `Dataset.prefetch()` for details.""" + def __init__(self, input_dataset, buffer_size, slack_period=None): + """See `Dataset.prefetch()` for details. + + Args: + input_dataset: The input dataset. + buffer_size: See `Dataset.prefetch()` for details. + slack_period: (Optional.) An integer. If non-zero, determines the number + of GetNext calls before injecting slack into the execution. This may + reduce CPU contention at the start of a step. Note that a tensorflow + user should not have to set this manually; enable this behavior + automatically via `tf.data.Options.experimental_slack` instead. Defaults + to None. + """ self._input_dataset = input_dataset if buffer_size is None: buffer_size = -1 # This is the sentinel for auto-tuning. @@ -3355,6 +3418,7 @@ class PrefetchDataset(UnaryUnchangedStructureDataset): variant_tensor = gen_dataset_ops.prefetch_dataset( input_dataset._variant_tensor, # pylint: disable=protected-access buffer_size=self._buffer_size, + slack_period=slack_period, **flat_structure(self)) super(PrefetchDataset, self).__init__(input_dataset, variant_tensor) diff --git a/tensorflow/python/data/ops/multi_device_iterator_ops.py b/tensorflow/python/data/ops/multi_device_iterator_ops.py index 7b8680c24a3..39392359e0e 100644 --- a/tensorflow/python/data/ops/multi_device_iterator_ops.py +++ b/tensorflow/python/data/ops/multi_device_iterator_ops.py @@ -210,7 +210,11 @@ class MultiDeviceIterator(object): than the max_buffer_size, we set the max_buffer_size to prefetch_buffer_size. 
""" + options = dataset_ops.Options() + options.experimental_distribute.num_devices = len(devices) + dataset = dataset.with_options(options) self._dataset = dataset._apply_options() # pylint: disable=protected-access + self._experimental_slack = dataset.options().experimental_slack self._devices = devices self._source_device = source_device self._source_device_tensor = ops.convert_to_tensor(source_device) @@ -279,7 +283,11 @@ class MultiDeviceIterator(object): ds = self._prototype_device_datasets[i] ds = _ReincarnatedPerDeviceGenerator(ds, self._incarnation_id) if self._prefetch_buffer_size > 0: - ds = ds.prefetch(self._prefetch_buffer_size) + if self._experimental_slack: + ds = dataset_ops.PrefetchDataset( + ds, self._prefetch_buffer_size, slack_period=1) + else: + ds = ds.prefetch(self._prefetch_buffer_size) # TODO(jsimsa): Enable auto-tuning and optimizations when supported for # non-CPU devices. options = dataset_ops.Options() diff --git a/tensorflow/python/data/ops/optional_ops.py b/tensorflow/python/data/ops/optional_ops.py index 8a85a7aca63..a2c0b924097 100644 --- a/tensorflow/python/data/ops/optional_ops.py +++ b/tensorflow/python/data/ops/optional_ops.py @@ -155,6 +155,14 @@ class OptionalStructure(structure.Structure): def __init__(self, value_structure): self._value_structure = value_structure + def __eq__(self, other): + # pylint: disable=protected-access + return (isinstance(other, OptionalStructure) and + self._value_structure == other._value_structure) + + def __hash__(self): + return hash(self._value_structure) + @property def _flat_shapes(self): return [tensor_shape.scalar()] diff --git a/tensorflow/python/data/util/structure.py b/tensorflow/python/data/util/structure.py index 850cdf68f2a..87337dc3edc 100644 --- a/tensorflow/python/data/util/structure.py +++ b/tensorflow/python/data/util/structure.py @@ -26,6 +26,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import tensor_util from tensorflow.python.ops import list_ops from tensorflow.python.ops import sparse_ops @@ -56,6 +57,27 @@ class Structure(object): the `tf.compat.v1.data.Iterator` and `Optional` classes. """ + @abc.abstractmethod + def __eq__(self, other): + """Returns the this structure and the input structure are equal. + + Args: + other: the structure to use for equality check + + Returns: + `True` if this and the input structure are equal and `False` otherwise. + """ + raise NotImplementedError("Structure.__eq__()") + + @abc.abstractmethod + def __hash__(self): + """Returns the hash of this structure. + + Returns: + The hash of this structure. + """ + raise NotImplementedError("Structure.__hash__()") + @abc.abstractproperty def _flat_shapes(self): """A list of shapes matching the shapes of `self._to_tensor_list()`. 
@@ -361,6 +383,22 @@ class NestedStructure(Structure): self._flat_shapes_list.extend(s._flat_shapes) self._flat_types_list.extend(s._flat_types) + def __eq__(self, other): + if not isinstance(other, NestedStructure): + return False + try: + # pylint: disable=protected-access + nest.assert_same_structure(self._nested_structure, + other._nested_structure) + except (ValueError, TypeError): + return False + + return nest.flatten(self._nested_structure) == nest.flatten( + other._nested_structure) + + def __hash__(self): + return hash(tuple(nest.flatten(self._nested_structure))) + @property def _flat_shapes(self): return self._flat_shapes_list @@ -479,6 +517,14 @@ class TensorStructure(Structure): self._dtype = dtypes.as_dtype(dtype) self._shape = tensor_shape.as_shape(shape) + def __eq__(self, other): + return (isinstance(other, TensorStructure) and tensor_spec.TensorSpec( + self._shape, self._dtype) == tensor_spec.TensorSpec( + other._shape, other._dtype)) + + def __hash__(self): + return hash(tensor_spec.TensorSpec(self._shape, self._dtype)) + @property def _flat_shapes(self): return [self._shape] @@ -553,6 +599,14 @@ class SparseTensorStructure(Structure): self._dtype = dtypes.as_dtype(dtype) self._dense_shape = tensor_shape.as_shape(dense_shape) + def __eq__(self, other): + return (isinstance(other, SparseTensorStructure) and tensor_spec.TensorSpec( + self._dense_shape, self._dtype) == tensor_spec.TensorSpec( + other._dense_shape, other._dtype)) + + def __hash__(self): + return hash(tensor_spec.TensorSpec(self._dense_shape, self._dtype)) + @property def _flat_shapes(self): # NOTE(mrry): The default flat shape of a boxed `SparseTensor` is `(3,)`, @@ -631,6 +685,17 @@ class TensorArrayStructure(Structure): self._dynamic_size = dynamic_size self._infer_shape = infer_shape + def __eq__(self, other): + return (isinstance(other, TensorArrayStructure) and tensor_spec.TensorSpec( + self._element_shape, self._dtype) == tensor_spec.TensorSpec( + other._element_shape, other._dtype) and + self._dynamic_size == other._dynamic_size and + self._infer_shape == other._infer_shape) + + def __hash__(self): + return hash((tensor_spec.TensorSpec(self._element_shape, self._dtype), + self._dynamic_size, self._infer_shape)) + @property def _flat_shapes(self): # A TensorArray is represented via its variant object, which is a scalar. @@ -721,6 +786,14 @@ class RaggedTensorStructure(Structure): self._shape = tensor_shape.as_shape(shape) self._ragged_rank = ragged_rank + def __eq__(self, other): + return (isinstance(other, RaggedTensorStructure) and tensor_spec.TensorSpec( + self._shape, self._dtype) == tensor_spec.TensorSpec( + other._shape, other._dtype)) + + def __hash__(self): + return hash(tensor_spec.TensorSpec(self._shape, self._dtype)) + @property def _flat_shapes(self): # A list of shapes matching the shapes of `self._to_tensor_list()`. diff --git a/tensorflow/python/data/util/structure_test.py b/tensorflow/python/data/util/structure_test.py index 3fb7793ccec..64fee62cc36 100644 --- a/tensorflow/python/data/util/structure_test.py +++ b/tensorflow/python/data/util/structure_test.py @@ -40,38 +40,43 @@ from tensorflow.python.ops.ragged import ragged_test_util from tensorflow.python.platform import test +# NOTE(mrry): Arguments of parameterized tests are lifted into lambdas to make +# sure they are not executed before the (eager- or graph-mode) test environment +# has been set up. +# +# TODO(jsimsa): Add tests for OptionalStructure and DatasetStructure. 
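The parameterized testEquality and testHash cases added below pin this contract down for tensors, tensor arrays, sparse tensors, and nests. For illustration, a small sketch of the `NestedStructure` behavior, again assuming the internal `tensorflow.python.data.util.structure` module and using hypothetical values not taken from the tests:

from tensorflow.python.data.util import structure
from tensorflow.python.framework import dtypes

# Equality is structure-aware: the nests must match and every leaf structure
# must compare equal; the hash is derived from the flattened leaf structures.
a = structure.NestedStructure({
    "x": structure.TensorStructure(dtypes.float32, []),
    "y": structure.SparseTensorStructure(dtypes.int32, [2, 2]),
})
b = structure.NestedStructure({
    "x": structure.TensorStructure(dtypes.float32, []),
    "y": structure.SparseTensorStructure(dtypes.int32, [2, 2]),
})
c = structure.NestedStructure({
    "x": structure.TensorStructure(dtypes.float32, []),
    "y": structure.TensorStructure(dtypes.string, []),
})
assert a == b and hash(a) == hash(b)  # same nesting, same leaf structures
assert not a == c                     # same keys, different leaf structures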
class StructureTest(test_base.DatasetTestBase, parameterized.TestCase, ragged_test_util.RaggedTensorTestCase): - # NOTE(mrry): The arguments must be lifted into lambdas because otherwise they - # will be executed before the (eager- or graph-mode) test environment has been - # set up. # pylint: disable=g-long-lambda,protected-access - @parameterized.parameters( - (lambda: constant_op.constant(37.0), structure.TensorStructure, + @parameterized.named_parameters( + ("Tensor", lambda: constant_op.constant(37.0), structure.TensorStructure, [dtypes.float32], [[]]), - (lambda: tensor_array_ops.TensorArray( + ("TensorArray", lambda: tensor_array_ops.TensorArray( dtype=dtypes.float32, element_shape=(3,), size=0), structure.TensorArrayStructure, [dtypes.variant], [None, 3]), - (lambda: sparse_tensor.SparseTensor( + ("SparseTensor", lambda: sparse_tensor.SparseTensor( indices=[[3, 4]], values=[-1], dense_shape=[4, 5]), structure.SparseTensorStructure, [dtypes.variant], [None]), - (lambda: ragged_factory_ops.constant([[1, 2], [], [4]]), + ("RaggedTensor", lambda: ragged_factory_ops.constant([[1, 2], [], [4]]), structure.RaggedTensorStructure, [dtypes.variant], [None]), - (lambda: (constant_op.constant(37.0), constant_op.constant([1, 2, 3])), + ("Nested_0", + lambda: (constant_op.constant(37.0), constant_op.constant([1, 2, 3])), structure.NestedStructure, [dtypes.float32, dtypes.int32], [[], [3]]), - (lambda: { + ("Nested_1", lambda: { "a": constant_op.constant(37.0), "b": constant_op.constant([1, 2, 3]) }, structure.NestedStructure, [dtypes.float32, dtypes.int32], [[], [3]]), - (lambda: { - "a": constant_op.constant(37.0), + ("Nested_2", lambda: { + "a": + constant_op.constant(37.0), "b": (sparse_tensor.SparseTensor( indices=[[0, 0]], values=[1], dense_shape=[1, 1]), sparse_tensor.SparseTensor( indices=[[3, 4]], values=[-1], dense_shape=[4, 5])) }, structure.NestedStructure, - [dtypes.float32, dtypes.variant, dtypes.variant], [[], None, None])) + [dtypes.float32, dtypes.variant, dtypes.variant], [[], None, None]), + ) def testFlatStructure(self, value_fn, expected_structure, expected_types, expected_shapes): value = value_fn() @@ -83,28 +88,27 @@ class StructureTest(test_base.DatasetTestBase, parameterized.TestCase, self.assertTrue( tensor_shape.as_shape(expected).is_compatible_with(actual)) - @parameterized.parameters( - (lambda: constant_op.constant(37.0), lambda: [ + @parameterized.named_parameters( + ("Tensor", lambda: constant_op.constant(37.0), lambda: [ constant_op.constant(38.0), array_ops.placeholder(dtypes.float32), variables.Variable(100.0), 42.0, np.array(42.0, dtype=np.float32) - ], lambda: [constant_op.constant([1.0, 2.0]), constant_op.constant(37)]), - (lambda: tensor_array_ops.TensorArray( - dtype=dtypes.float32, element_shape=(3,), size=0), - lambda: [ - tensor_array_ops.TensorArray( - dtype=dtypes.float32, element_shape=(3,), size=0), - tensor_array_ops.TensorArray( - dtype=dtypes.float32, element_shape=(3,), size=10) - ], - lambda: [ - tensor_array_ops.TensorArray( - dtype=dtypes.int32, element_shape=(3,), size=0), - tensor_array_ops.TensorArray( - dtype=dtypes.float32, element_shape=(), size=0) - ]), - (lambda: sparse_tensor.SparseTensor( + ], lambda: [constant_op.constant([1.0, 2.0]), + constant_op.constant(37)]), + ("TensorArray", lambda: tensor_array_ops.TensorArray( + dtype=dtypes.float32, element_shape=(3,), size=0), lambda: [ + tensor_array_ops.TensorArray( + dtype=dtypes.float32, element_shape=(3,), size=0), + tensor_array_ops.TensorArray( + dtype=dtypes.float32, 
element_shape=(3,), size=10) + ], lambda: [ + tensor_array_ops.TensorArray( + dtype=dtypes.int32, element_shape=(3,), size=0), + tensor_array_ops.TensorArray( + dtype=dtypes.float32, element_shape=(), size=0) + ]), + ("SparseTensor", lambda: sparse_tensor.SparseTensor( indices=[[3, 4]], values=[-1], dense_shape=[4, 5]), lambda: [ sparse_tensor.SparseTensor( @@ -122,16 +126,17 @@ class StructureTest(test_base.DatasetTestBase, parameterized.TestCase, sparse_tensor.SparseTensor( indices=[[3, 4]], values=[-1.0], dense_shape=[4, 5]) ]), - (lambda: ragged_factory_ops.constant([[1, 2], [], [3]]), lambda: [ - ragged_factory_ops.constant([[1, 2], [3, 4], []]), - ragged_factory_ops.constant([[1], [2, 3, 4], [5]]), - ], lambda: [ - ragged_factory_ops.constant(1), - ragged_factory_ops.constant([1, 2]), - ragged_factory_ops.constant([[1], [2]]), - ragged_factory_ops.constant([["a", "b"]]), - ]), - (lambda: { + ("RaggedTensor", lambda: ragged_factory_ops.constant([[1, 2], [], [3]]), + lambda: [ + ragged_factory_ops.constant([[1, 2], [3, 4], []]), + ragged_factory_ops.constant([[1], [2, 3, 4], [5]]), + ], lambda: [ + ragged_factory_ops.constant(1), + ragged_factory_ops.constant([1, 2]), + ragged_factory_ops.constant([[1], [2]]), + ragged_factory_ops.constant([["a", "b"]]), + ]), + ("Nested", lambda: { "a": constant_op.constant(37.0), "b": constant_op.constant([1, 2, 3]) }, lambda: [{ @@ -167,25 +172,110 @@ class StructureTest(test_base.DatasetTestBase, parameterized.TestCase, s.is_compatible_with( structure.Structure.from_value(incompatible_value))) - @parameterized.parameters( - (lambda: constant_op.constant(37.0),), - (lambda: sparse_tensor.SparseTensor( - indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),), - (lambda: tensor_array_ops.TensorArray( + @parameterized.named_parameters( + ("Tensor", lambda: constant_op.constant(37.0), + lambda: constant_op.constant(42.0), lambda: constant_op.constant([5])), + ("TensorArray", lambda: tensor_array_ops.TensorArray( + dtype=dtypes.float32, element_shape=(3,), size=0), + lambda: tensor_array_ops.TensorArray( + dtype=dtypes.float32, element_shape=(3,), size=0), + lambda: tensor_array_ops.TensorArray( + dtype=dtypes.int32, element_shape=(), size=0)), + ("SparseTensor", lambda: sparse_tensor.SparseTensor( + indices=[[3, 4]], values=[-1], dense_shape=[4, 5]), + lambda: sparse_tensor.SparseTensor( + indices=[[1, 2]], values=[42], dense_shape=[4, 5]), lambda: + sparse_tensor.SparseTensor(indices=[[3]], values=[-1], dense_shape=[5])), + ("Nested", lambda: { + "a": constant_op.constant(37.0), + "b": constant_op.constant([1, 2, 3]) + }, lambda: { + "a": constant_op.constant(42.0), + "b": constant_op.constant([4, 5, 6]) + }, lambda: { + "a": constant_op.constant([1, 2, 3]), + "b": constant_op.constant(37.0) + }), + ) + def testEquality(self, value1_fn, value2_fn, value3_fn): + s1 = structure.Structure.from_value(value1_fn()) + s2 = structure.Structure.from_value(value2_fn()) + s3 = structure.Structure.from_value(value3_fn()) + self.assertEqual(s1, s1) + self.assertEqual(s1, s2) + self.assertNotEqual(s1, s3) + self.assertNotEqual(s2, s3) + + @parameterized.named_parameters( + ("Tensor", lambda: constant_op.constant(37.0), + lambda: constant_op.constant(42.0), lambda: constant_op.constant([5])), + ("TensorArray", lambda: tensor_array_ops.TensorArray( + dtype=dtypes.float32, element_shape=(3,), size=0), + lambda: tensor_array_ops.TensorArray( + dtype=dtypes.float32, element_shape=(3,), size=0), + lambda: tensor_array_ops.TensorArray( + dtype=dtypes.int32, element_shape=(), 
size=0)), + ("SparseTensor", lambda: sparse_tensor.SparseTensor( + indices=[[3, 4]], values=[-1], dense_shape=[4, 5]), + lambda: sparse_tensor.SparseTensor( + indices=[[1, 2]], values=[42], dense_shape=[4, 5]), lambda: + sparse_tensor.SparseTensor(indices=[[3]], values=[-1], dense_shape=[5])), + ("Nested", lambda: { + "a": constant_op.constant(37.0), + "b": constant_op.constant([1, 2, 3]) + }, lambda: { + "a": constant_op.constant(42.0), + "b": constant_op.constant([4, 5, 6]) + }, lambda: { + "a": constant_op.constant([1, 2, 3]), + "b": constant_op.constant(37.0) + }), + ) + def testHash(self, value1_fn, value2_fn, value3_fn): + s1 = structure.Structure.from_value(value1_fn()) + s2 = structure.Structure.from_value(value2_fn()) + s3 = structure.Structure.from_value(value3_fn()) + self.assertEqual(hash(s1), hash(s1)) + self.assertEqual(hash(s1), hash(s2)) + self.assertNotEqual(hash(s1), hash(s3)) + self.assertNotEqual(hash(s2), hash(s3)) + + @parameterized.named_parameters( + ( + "Tensor", + lambda: constant_op.constant(37.0), + ), + ( + "SparseTensor", + lambda: sparse_tensor.SparseTensor( + indices=[[3, 4]], values=[-1], dense_shape=[4, 5]), + ), + ("TensorArray", lambda: tensor_array_ops.TensorArray( dtype=dtypes.float32, element_shape=(), size=1).write(0, 7)), - (lambda: ragged_factory_ops.constant([[1, 2], [], [3]]),), - (lambda: {"a": constant_op.constant(37.0), - "b": constant_op.constant([1, 2, 3])},), - (lambda: {"a": constant_op.constant(37.0), - "b": (sparse_tensor.SparseTensor( - indices=[[0, 0]], values=[1], dense_shape=[1, 1]), - sparse_tensor.SparseTensor( - indices=[[3, 4]], values=[-1], dense_shape=[4, 5])) - },), - ) + ("RaggedTensor", lambda: ragged_factory_ops.constant([[1, 2], [], [3]]),), + ( + "Nested_0", + lambda: { + "a": constant_op.constant(37.0), + "b": constant_op.constant([1, 2, 3]) + }, + ), + ( + "Nested_1", + lambda: { + "a": + constant_op.constant(37.0), + "b": (sparse_tensor.SparseTensor( + indices=[[0, 0]], values=[1], dense_shape=[1, 1]), + sparse_tensor.SparseTensor( + indices=[[3, 4]], values=[-1], dense_shape=[4, 5])) + }, + ), + ) def testRoundTripConversion(self, value_fn): value = value_fn() s = structure.Structure.from_value(value) + def maybe_stack_ta(v): if isinstance(v, tensor_array_ops.TensorArray): return v.stack() @@ -209,6 +299,7 @@ class StructureTest(test_base.DatasetTestBase, parameterized.TestCase, self.assertRaggedEqual(b, a) else: self.assertAllEqual(b, a) + # pylint: enable=g-long-lambda def testIncompatibleStructure(self): @@ -388,33 +479,40 @@ class StructureTest(test_base.DatasetTestBase, parameterized.TestCase, @parameterized.named_parameters( ("Tensor", dtypes.float32, tensor_shape.scalar(), ops.Tensor, structure.TensorStructure(dtypes.float32, [])), - ("SparseTensor", dtypes.int32, tensor_shape.matrix(2, 2), - sparse_tensor.SparseTensor, + ("SparseTensor", dtypes.int32, tensor_shape.matrix( + 2, 2), sparse_tensor.SparseTensor, structure.SparseTensorStructure(dtypes.int32, [2, 2])), - ("TensorArray0", dtypes.int32, tensor_shape.as_shape([None, True, 2, 2]), - tensor_array_ops.TensorArray, + ("TensorArray_0", dtypes.int32, tensor_shape.as_shape( + [None, True, 2, 2]), tensor_array_ops.TensorArray, structure.TensorArrayStructure( dtypes.int32, [2, 2], dynamic_size=None, infer_shape=True)), - ("TensorArray1", dtypes.int32, tensor_shape.as_shape([True, None, 2, 2]), - tensor_array_ops.TensorArray, + ("TensorArray_1", dtypes.int32, tensor_shape.as_shape( + [True, None, 2, 2]), tensor_array_ops.TensorArray, 
structure.TensorArrayStructure( dtypes.int32, [2, 2], dynamic_size=True, infer_shape=None)), - ("TensorArray2", dtypes.int32, tensor_shape.as_shape([True, False, 2, 2]), - tensor_array_ops.TensorArray, + ("TensorArray_2", dtypes.int32, tensor_shape.as_shape( + [True, False, 2, 2]), tensor_array_ops.TensorArray, structure.TensorArrayStructure( dtypes.int32, [2, 2], dynamic_size=True, infer_shape=False)), ("RaggedTensor", dtypes.int32, tensor_shape.matrix(2, 2), structure.RaggedTensorStructure(dtypes.int32, [2, 2], 1), structure.RaggedTensorStructure(dtypes.int32, [2, 2], 1)), - ("Nest", - {"a": dtypes.float32, "b": (dtypes.int32, dtypes.string)}, - {"a": tensor_shape.scalar(), - "b": (tensor_shape.matrix(2, 2), tensor_shape.scalar())}, - {"a": ops.Tensor, "b": (sparse_tensor.SparseTensor, ops.Tensor)}, + ("Nested", { + "a": dtypes.float32, + "b": (dtypes.int32, dtypes.string) + }, { + "a": tensor_shape.scalar(), + "b": (tensor_shape.matrix(2, 2), tensor_shape.scalar()) + }, { + "a": ops.Tensor, + "b": (sparse_tensor.SparseTensor, ops.Tensor) + }, structure.NestedStructure({ - "a": structure.TensorStructure(dtypes.float32, []), + "a": + structure.TensorStructure(dtypes.float32, []), "b": (structure.SparseTensorStructure(dtypes.int32, [2, 2]), - structure.TensorStructure(dtypes.string, []))})), + structure.TensorStructure(dtypes.string, [])) + })), ) def testConvertLegacyStructure(self, output_types, output_shapes, output_classes, expected_structure): @@ -471,7 +569,7 @@ class StructureTest(test_base.DatasetTestBase, parameterized.TestCase, ("RaggedTensorUnknown", structure.RaggedTensorStructure(dtypes.float32, [4, None], 1), None, structure.RaggedTensorStructure(dtypes.float32, [None, 4, None], 2)), - ("Nest", structure.NestedStructure({ + ("Nested", structure.NestedStructure({ "a": structure.TensorStructure(dtypes.float32, []), "b": (structure.SparseTensorStructure(dtypes.int32, [2, 2]), structure.TensorStructure(dtypes.string, []))}), 128, @@ -505,7 +603,7 @@ class StructureTest(test_base.DatasetTestBase, parameterized.TestCase, ("RaggedTensorUnknown", structure.RaggedTensorStructure(dtypes.float32, [None, None, 4], 2), structure.RaggedTensorStructure(dtypes.float32, [None, 4], 1)), - ("Nest", structure.NestedStructure({ + ("Nested", structure.NestedStructure({ "a": structure.TensorStructure(dtypes.float32, [128]), "b": (structure.SparseTensorStructure(dtypes.int32, [128, 2, 2]), structure.TensorStructure(dtypes.string, [None]))}), diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD index 9c4b57edfe6..2b8ca7eecc5 100644 --- a/tensorflow/python/debug/BUILD +++ b/tensorflow/python/debug/BUILD @@ -116,6 +116,7 @@ py_library( py_binary( name = "grpc_tensorflow_server", srcs = ["lib/grpc_tensorflow_server.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [":grpc_tensorflow_server_lib"], ) @@ -381,6 +382,7 @@ py_library( py_binary( name = "offline_analyzer", srcs = ["cli/offline_analyzer.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [":offline_analyzer_lib"], ) @@ -412,6 +414,7 @@ py_library( py_binary( name = "debug_fibonacci", srcs = ["examples/debug_fibonacci.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [":debug_fibonacci_lib"], ) @@ -431,6 +434,7 @@ py_library( py_binary( name = "debug_errors", srcs = ["examples/debug_errors.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [":debug_errors_lib"], ) @@ -449,6 +453,7 @@ py_library( py_binary( name = "debug_mnist", srcs = ["examples/debug_mnist.py"], + 
python_version = "PY2", srcs_version = "PY2AND3", deps = [":debug_mnist_lib"], ) @@ -467,6 +472,7 @@ py_library( py_binary( name = "debug_tflearn_iris", srcs = ["examples/debug_tflearn_iris.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [":debug_tflearn_iris_lib"], ) @@ -485,6 +491,7 @@ py_library( py_binary( name = "debug_keras", srcs = ["examples/debug_keras.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [":debug_keras_lib"], ) @@ -504,6 +511,7 @@ py_test( name = "common_test", size = "small", srcs = ["lib/common_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":common", @@ -518,6 +526,7 @@ py_test( name = "debug_graphs_test", size = "small", srcs = ["lib/debug_graphs_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":debug_graphs", @@ -530,6 +539,7 @@ py_test( name = "debug_data_test", size = "small", srcs = ["lib/debug_data_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":debug_data", @@ -565,6 +575,7 @@ py_test( name = "debug_utils_test", size = "small", srcs = ["lib/debug_utils_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":debug_utils", @@ -584,6 +595,7 @@ py_test( name = "source_utils_test", size = "small", srcs = ["lib/source_utils_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":debug_data", @@ -610,6 +622,7 @@ py_test( name = "source_remote_test", size = "small", srcs = ["lib/source_remote_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_windows", @@ -636,6 +649,7 @@ py_test( name = "framework_test", size = "medium", srcs = ["wrappers/framework_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":debug_data", @@ -660,6 +674,7 @@ py_test( name = "profiling_test", size = "small", srcs = ["lib/profiling_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":profiling", @@ -673,6 +688,7 @@ py_test( name = "curses_ui_test", size = "small", srcs = ["cli/curses_ui_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_windows", @@ -693,6 +709,7 @@ py_test( name = "readline_ui_test", size = "small", srcs = ["cli/readline_ui_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":cli_config", @@ -832,6 +849,7 @@ py_test( name = "debugger_cli_common_test", size = "small", srcs = ["cli/debugger_cli_common_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":debugger_cli_common", @@ -846,6 +864,7 @@ py_test( name = "cli_config_test", size = "small", srcs = ["cli/cli_config_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":cli_config", @@ -859,6 +878,7 @@ py_test( name = "command_parser_test", size = "small", srcs = ["cli/command_parser_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":command_parser", @@ -871,6 +891,7 @@ py_test( name = "tensor_format_test", size = "small", srcs = ["cli/tensor_format_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":cli_test_utils", @@ -888,6 +909,7 @@ py_test( name = "cli_shared_test", size = "small", srcs = ["cli/cli_shared_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":cli_shared", @@ -907,6 +929,7 @@ py_test( srcs = [ "cli/evaluator_test.py", ], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":debug_data", @@ -958,6 +981,7 @@ py_test( name = "profile_analyzer_cli_test", size = "small", srcs = ["cli/profile_analyzer_cli_test.py"], + python_version = "PY2", 
srcs_version = "PY2AND3", deps = [ ":debugger_cli_common", @@ -1061,6 +1085,7 @@ py_test( name = "dumping_wrapper_test", size = "small", srcs = ["wrappers/dumping_wrapper_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":debug_data", @@ -1083,6 +1108,7 @@ py_test( name = "local_cli_wrapper_test", size = "small", srcs = ["wrappers/local_cli_wrapper_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":cli_shared", @@ -1110,6 +1136,7 @@ py_test( name = "disk_usage_test", size = "small", srcs = ["wrappers/disk_usage_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":dumping_wrapper", diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 3b070c29f90..c71041042ca 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -977,6 +977,9 @@ distribute_py_test( size = "medium", srcs = ["saved_model_test.py"], main = "saved_model_test.py", + tags = [ + "no_pip", # b/131691139 + ], deps = [ ":combinations", ":model_combinations", diff --git a/tensorflow/python/distribute/multi_worker_test_base.py b/tensorflow/python/distribute/multi_worker_test_base.py index 54677df30e3..caedfbb4abc 100644 --- a/tensorflow/python/distribute/multi_worker_test_base.py +++ b/tensorflow/python/distribute/multi_worker_test_base.py @@ -379,7 +379,6 @@ class IndependentWorkerTestBase(test.TestCase): """Testing infra for independent workers.""" def _make_mock_run_std_server(self): - thread_local = threading.local() def _mock_run_std_server(*args, **kwargs): ret = original_run_std_server(*args, **kwargs) @@ -387,9 +386,9 @@ class IndependentWorkerTestBase(test.TestCase): # of remote sessions taking local ports that have been assigned to std # servers. Only call this barrier the first time this function is run for # each thread. - if not getattr(thread_local, 'server_started', False): + if not getattr(self._thread_local, 'server_started', False): self._barrier.wait() - thread_local.server_started = True + self._thread_local.server_started = True return ret return _mock_run_std_server @@ -401,6 +400,8 @@ class IndependentWorkerTestBase(test.TestCase): self._coord = coordinator.Coordinator() super(IndependentWorkerTestBase, self).setUp() self._mock_context.__enter__() + # threading local object to be shared by all threads + self._thread_local = threading.local() def tearDown(self): self._mock_context.__exit__(None, None, None) @@ -421,18 +422,39 @@ class IndependentWorkerTestBase(test.TestCase): def _run_task_in_thread(self, task_fn, cluster_spec, task_type, task_id, *args, **kwargs): - if task_type: - tf_config = { - 'cluster': cluster_spec, - 'task': { - 'type': task_type, - 'index': task_id - } - } - else: - tf_config = { - 'cluster': cluster_spec, - } + """Run tasks in a thread. + + If `tf_config` is provided, use it for the new thread; if not, construct one + from `cluster_spec`, `task_type`, and `task_id`, and provide it to the new + thread to be set as `TF_CONFIG` environment. + + Arguments: + task_fn: The function to run in the new thread. + cluster_spec: The cluster spec. + task_type: The task type. + task_id: The task id. + *args: Additional positional arguments to provide to the thread's task_fn. + **kwargs: Additional keyword arguments to provide to the thread's task_fn. + If `tf_config` is provided, that dict will be used for the TF_CONFIG for + the new thread. + + Returns: + The thread that has started. 
+ """ + tf_config = kwargs.pop('tf_config', None) + if tf_config is None: + if task_type: + tf_config = { + 'cluster': cluster_spec, + 'task': { + 'type': task_type, + 'index': task_id + } + } + else: + tf_config = { + 'cluster': cluster_spec, + } t = threading.Thread( target=self._task_thread, args=(task_fn, tf_config, context.executing_eagerly()) + args, diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index a12b0dbe5a0..7cf1a00021c 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -233,6 +233,69 @@ LogicalDeviceSpec = collections.namedtuple( "LogicalDeviceSpec", ("device_map", "logical_device")) +class WorkerDeviceMap(DeviceMap): + """A device map for one value per worker.""" + + def __init__(self, devices, num_replicas_per_worker): + """Initialize a `WorkerDeviceMap`. + + Args: + devices: `devices[i]` is the string device for worker `i` in in-graph + relication case; devices is single-element list for its corresponding + worker in between-graph case. + num_replicas_per_worker: number of replicas per worker, useful in in-graph + replication case. + """ + self._devices = tuple(device_util.canonicalize(d) for d in devices) + if len(set(self._devices)) != len(self._devices): + raise ValueError("Duplicate devices in %s, after canonicalization: %s" % + (devices, self._devices)) + self._num_replicas_per_worker = num_replicas_per_worker + + @property + def all_devices(self): + return self._devices + + @property + def devices_by_replica(self): + raise ValueError("`WorkerDeviceMap` is not indexed by replicas") + + @property + def num_logical_devices(self): + return 1 + + @property + def num_replicas_in_graph(self): + return len(self._devices) + + def logical_device_from_values(self, values): + del values + return 0 + + def logical_to_actual_devices(self, logical_device_id): + assert logical_device_id == 0 + return self._devices + + def select_for_current_replica(self, values, replica_context): + return values[replica_context.replica_id_in_sync_group // + self._num_replicas_per_worker] + + def replica_for_device(self, device): + raise ValueError("`WorkerDeviceMap` not indexed by replicas") + + def select_for_device(self, values, device): + # TODO(yuefengz): this should map from any device to the value on its + # corresponding worker. + return values[self._devices.index(device_util.canonicalize(device))] + + def is_device_in_replica(self, device, replica_id): + raise ValueError("WorkerDeviceMap not indexed by replicas") + + def __repr__(self): + return "%s(%r, num_replicas_per_worker=%d)" % ( + self.__class__.__name__, self._devices, self._num_replicas_per_worker) + + class DistributedValues(object): """Holds a map from device to values. 
Either PerReplica or Mirrored.""" diff --git a/tensorflow/python/distribute/values_test.py b/tensorflow/python/distribute/values_test.py index a96f9f36b89..2312973430a 100644 --- a/tensorflow/python/distribute/values_test.py +++ b/tensorflow/python/distribute/values_test.py @@ -945,5 +945,64 @@ class PerReplicaTest(test.TestCase): self.assertEqual(x._component_metadata(), y._component_metadata()) +class WorkerDeviceMapTest(test.TestCase): + + class ReplicaContext(object): + + def __init__(self, replica_id_in_sync_group): + self.replica_id_in_sync_group = replica_id_in_sync_group + + def testBasic(self): + devices = [ + "/job:worker/replica:0/task:0/device:CPU:0", + "/job:worker/replica:0/task:2/device:CPU:0" + ] + device_map = values.WorkerDeviceMap(devices, 1) + self.assertAllEqual(devices, device_map.all_devices) + + # pylint:disable=pointless-statement + with self.assertRaisesWithPredicateMatch( + ValueError, "`WorkerDeviceMap` is not indexed by replicas"): + device_map.devices_by_replica + + self.assertEqual(1, device_map.num_logical_devices) + + self.assertEqual(2, device_map.num_replicas_in_graph) + + self.assertEqual(0, device_map.logical_device_from_values(["a", "b"])) + + self.assertAllEqual(devices, device_map.logical_to_actual_devices(0)) + + replica_context = WorkerDeviceMapTest.ReplicaContext(1) + self.assertEqual( + "b", device_map.select_for_current_replica(["a", "b"], replica_context)) + + with self.assertRaisesWithPredicateMatch( + ValueError, "`WorkerDeviceMap` not indexed by replicas"): + device_map.replica_for_device(devices[1]) + + self.assertEqual("b", device_map.select_for_device(["a", "b"], devices[1])) + + with self.assertRaisesWithPredicateMatch( + ValueError, "WorkerDeviceMap not indexed by replicas"): + device_map.is_device_in_replica(devices[1], 1) + + self.assertEqual( + "WorkerDeviceMap(('/job:worker/replica:0/task:0/device:CPU:0', " + "'/job:worker/replica:0/task:2/device:CPU:0'), " + "num_replicas_per_worker=1)", repr(device_map)) + + def testMultipleReplicasPerWorker(self): + devices = [ + "/job:worker/replica:0/task:0/device:CPU:0", + "/job:worker/replica:0/task:2/device:CPU:0" + ] + device_map = values.WorkerDeviceMap(devices, 2) + + replica_context = WorkerDeviceMapTest.ReplicaContext(3) + self.assertEqual( + "b", device_map.select_for_current_replica(["a", "b"], replica_context)) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index a69ad1d7e42..7149245a7ec 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -1194,6 +1194,9 @@ class Context(object): @log_device_placement.setter def log_device_placement(self, enabled): + if self._log_device_placement == enabled: + return + if self._context_handle is not None: raise RuntimeError( "Device placement logging must be set at program startup") diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py index bdcc928da9f..4a5f481114f 100644 --- a/tensorflow/python/eager/def_function.py +++ b/tensorflow/python/eager/def_function.py @@ -58,6 +58,7 @@ class UnliftedInitializerVariable(resource_variable_ops.UninitializedVariable): lifted_initializer_graph=None, synchronization=None, aggregation=None, + shape=None, **unused_kwargs): """Creates a variable. @@ -101,6 +102,10 @@ class UnliftedInitializerVariable(resource_variable_ops.UninitializedVariable): aggregation: Indicates how a distributed variable will be aggregated. 
Accepted values are constants defined in the class `tf.VariableAggregation`. + shape: (optional) The shape of this variable. If None, the shape of + `initial_value` will be used. When setting this argument to + `tf.TensorShape(None)` (representing an unspecified shape), the variable + can be assigned with values of different shapes. Raises: ValueError: If the initial value is not specified, or does not have a @@ -135,12 +140,17 @@ class UnliftedInitializerVariable(resource_variable_ops.UninitializedVariable): name="initial_value", dtype=dtype) assert initial_value is not None + # Don't use `shape or initial_value.shape` since TensorShape has + # overridden `__bool__`. + if shape is None: + shape = initial_value.shape + # Use the constructor for UninitializedVariable to start. super(UnliftedInitializerVariable, self).__init__( trainable=trainable, caching_device=caching_device, name=name, - shape=initial_value.shape, + shape=shape, dtype=initial_value.dtype, constraint=constraint, synchronization=synchronization, diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc index 3fb44b46205..7993abc7e64 100644 --- a/tensorflow/python/eager/pywrap_tfe_src.cc +++ b/tensorflow/python/eager/pywrap_tfe_src.cc @@ -1802,7 +1802,13 @@ PyObject* GetPythonObjectFromInt(int num) { } bool CheckResourceVariable(PyObject* item) { - return PyObject_TypeCheck(item, resource_variable_type); + if (PyObject_TypeCheck(item, resource_variable_type)) { + tensorflow::Safe_PyObjectPtr handle( + PyObject_GetAttrString(item, "_handle")); + return EagerTensor_CheckExact(handle.get()); + } + + return false; } bool IsNumberType(PyObject* item) { diff --git a/tensorflow/python/eager/pywrap_tfe_test.py b/tensorflow/python/eager/pywrap_tfe_test.py index 9bded967e18..5299d1ecebe 100644 --- a/tensorflow/python/eager/pywrap_tfe_test.py +++ b/tensorflow/python/eager/pywrap_tfe_test.py @@ -26,6 +26,7 @@ from tensorflow.python.eager import def_function from tensorflow.python.eager import test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -255,6 +256,18 @@ class Tests(test.TestCase): "Expected list for 'values' argument"): _ = array_ops.stack(value, axis=1) + def testGraphResourceVariableRaisesFallback(self): + with ops.Graph().as_default(): + a_2_by_2 = constant_op.constant(1.0, shape=[2, 2]) + m = resource_variable_ops.ResourceVariable(a_2_by_2) + ctx = context.context() + ctx.ensure_initialized() + with self.assertRaises(core._FallbackException): + pywrap_tensorflow.TFE_Py_FastPathExecute(ctx._handle, ctx.device_name, + "MatMul", None, None, m, m, + "transpose_a", False, + "transpose_b", False) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/eager/wrap_function.py b/tensorflow/python/eager/wrap_function.py index a529c1fbd9b..d9789dd571f 100644 --- a/tensorflow/python/eager/wrap_function.py +++ b/tensorflow/python/eager/wrap_function.py @@ -21,11 +21,14 @@ from __future__ import print_function import weakref +from tensorflow.core.protobuf import meta_graph_pb2 from tensorflow.python.eager import function from tensorflow.python.eager import lift_to_graph from tensorflow.python.framework import func_graph from tensorflow.python.framework import importer from tensorflow.python.framework import ops +from tensorflow.python.framework 
import sparse_tensor +from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope from tensorflow.python.training.tracking import data_structures @@ -84,6 +87,45 @@ class VariableHolder(object): return wrapped +def _get_tensor_from_tensor_info(tensor_info, graph): + """Simplified copy of the deprecated `get_tensor_from_tensor_info`.""" + encoding = tensor_info.WhichOneof("encoding") + if encoding == "name": + return graph.get_tensor_by_name(tensor_info.name) + elif encoding == "coo_sparse": + return sparse_tensor.SparseTensor( + graph.get_tensor_by_name(tensor_info.coo_sparse.indices_tensor_name), + graph.get_tensor_by_name(tensor_info.coo_sparse.values_tensor_name), + graph.get_tensor_by_name( + tensor_info.coo_sparse.dense_shape_tensor_name)) + else: + raise ValueError("Invalid TensorInfo.encoding: %s" % encoding) + + +def _lift_single_variable(old_variable, graph, variable_holder): + """Lifts `old_variable` out of the `FuncGraph` `graph`.""" + new_variable = resource_variable_ops.UninitializedVariable( + shape=old_variable.shape, + dtype=old_variable.dtype, + name=old_variable.op.name, + trainable=old_variable.trainable, + extra_handle_data=old_variable.handle) + new_variable._initializer_op = old_variable._initializer_op # pylint: disable=protected-access + graph.inputs.append(old_variable.handle) + graph.captures[new_variable.handle] = old_variable.handle + # Now that we've added the new variable to graph.captures, + # graph.capture will use that cached value and do some post-processing + # on the capture like recording it on the tape. + graph.capture(new_variable.handle) + # pylint: disable=protected-access + variable_name = new_variable.name.split(":")[0] + variable_holder._variables_by_name[variable_name] = new_variable + graph._weak_variables.append(weakref.ref(new_variable)) + # pylint: enable=protected-access + graph.watch_variable(new_variable) + return new_variable + + def _lift_unlifted_variables(graph, variable_holder): """Finds resource variables and lifts them into the outer context. @@ -100,39 +142,44 @@ def _lift_unlifted_variables(graph, variable_holder): variable_holder: A VariableHolder to record the lifted variables in. """ with graph.as_default(): - collection_variables = ( - ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + - ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES)) + global_collection_variables = ops.get_collection( + ops.GraphKeys.GLOBAL_VARIABLES) + local_collection_variables = ops.get_collection( + ops.GraphKeys.LOCAL_VARIABLES) existing_captures = set(graph.internal_captures) lifted_variables = {} - for old_variable in collection_variables: - if (old_variable._in_graph_mode # pylint: disable=protected-access - and - isinstance(old_variable, resource_variable_ops.ResourceVariable)): - if old_variable.handle in existing_captures: - continue - new_variable = resource_variable_ops.UninitializedVariable( - shape=old_variable.shape, - dtype=old_variable.dtype, - name=old_variable.op.name, - trainable=old_variable.trainable, - extra_handle_data=old_variable.handle) - new_variable._initializer_op = old_variable._initializer_op # pylint: disable=protected-access - graph.inputs.append(old_variable.handle) - graph.captures[new_variable.handle] = old_variable.handle - # Now that we've added the new variable to graph.captures, - # graph.capture will use that cached value and do some post-processing - # on the capture like recording it on the tape. 
- graph.capture(new_variable.handle) - existing_captures.add(old_variable.handle) + + def _should_lift_variable(v): + return ((v._in_graph_mode # pylint: disable=protected-access + and v.graph.building_function) + and isinstance(v, resource_variable_ops.ResourceVariable) + and v.handle not in existing_captures) + + for old_variable in global_collection_variables: + if _should_lift_variable(old_variable): + new_variable = _lift_single_variable( + old_variable, graph, variable_holder) lifted_variables[old_variable] = new_variable - # pylint: disable=protected-access - variable_name = new_variable.name.split(":")[0] - variable_holder._variables_by_name[variable_name] = new_variable - graph._weak_variables.append(weakref.ref(new_variable)) - # pylint: enable=protected-access - graph.watch_variable(new_variable) - # Update the graph's collections, partly for the user and partly so this + existing_captures.add(old_variable.handle) + + for old_variable in local_collection_variables: + if _should_lift_variable(old_variable): + new_variable = _lift_single_variable( + old_variable, graph, variable_holder) + lifted_variables[old_variable] = new_variable + existing_captures.add(old_variable.handle) + if new_variable._in_graph_mode: # pylint: disable=protected-access + outer_graph = new_variable.graph + # Variables are added to the global collection by default. In this + # case we only want the variable in the local collection, so we'll pop + # it out. + global_collection = outer_graph.get_collection_ref( + ops.GraphKeys.GLOBAL_VARIABLES) + global_collection.remove(new_variable) + outer_graph.add_to_collection( + ops.GraphKeys.LOCAL_VARIABLES, new_variable) + + # Update the FuncGraph's collections, partly for the user and partly so this # function is idempotent when it runs again in prune() calls. for collection_name in [ ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.LOCAL_VARIABLES @@ -148,18 +195,40 @@ class WrappedFunction(function.ConcreteFunction): def __init__(self, fn_graph, variable_holder, attrs=None, signature=None): self._variable_holder = variable_holder - if ops.executing_eagerly_outside_functions(): - # TODO(allenl): Make this work in 1.x? - _lift_unlifted_variables(fn_graph, variable_holder) + _lift_unlifted_variables(fn_graph, variable_holder) # We call __init__ after lifting variables so that the function's signature # properly reflects the new captured inputs. super(WrappedFunction, self).__init__( fn_graph, attrs=attrs, signature=signature) def prune(self, feeds, fetches, name=None, input_signature=None): + """Extract a subgraph of this function's underlying graph. + + Wraps the subgraph in a new `WrappedFunction` object. + + Args: + feeds: Input tensors to the subgraph to extract, as `Tensor` objects. + fetches: Possibly-nested Python data structure containing information + about outputs of the target subgraph. Each entry can either be a + `Tensor` object (for data outputs), an `Operation` object (for control + outputs), or a `TensorInfo` proto. Any additional shape/dtype + information provided in a `TensorInfo` and not present in the original + graph will be added to the returned subgraph. + name: (optional) Name to give to the underlying `FuncGraph` of the + returned object. If no name is provided, the graph's name will be + `"pruned"`. + input_signature: (optional) possibly-nested Python data structure + containing `TensorSpec` objects, with which to populate the returned + functions's `FuncGraph`'s `structured_input_signature` field. 
+ + Returns: + A new `WrappedFunction` object containing a copy of the portion of this + object's graph that goes from `feeds` to `fetches`. + """ # TODO(b/129646028): Add support for CompositeTensors. name = name or "pruned" - flat_feeds, flat_fetches = nest.flatten(feeds), nest.flatten(fetches) + feeds = nest.map_structure(self.graph.as_graph_element, feeds) + flat_feeds = nest.flatten(feeds) for f in flat_feeds: if not isinstance(f, ops.Tensor): raise ValueError("Feeds must be tensors.") @@ -170,28 +239,64 @@ class WrappedFunction(function.ConcreteFunction): flat_feeds = [f for f in flat_feeds if f not in internal_captures] operation_fetches = [] - for f in flat_fetches: + tensor_fetches = [] + tensor_infos = [] + + def _fetch_preprocesing_callback(f): + """Extract out lists of ops, tensors, and tensor type info. + + Turns TensorInfos into Tensors in the original fetches structure. + + Args: + f: The fetch to preprocess: Tensor, TensorInfo, or Operation, or string + identifying a Tensor or Operation. + + Returns: + `f` converted to a Tensor. + """ if isinstance(f, ops.Operation): operation_fetches.append(f) - elif not isinstance(f, ops.Tensor): - raise ValueError("Fetches must be tensors or operations.") - for f in flat_feeds + flat_fetches: + return f + elif isinstance(f, meta_graph_pb2.TensorInfo): + tensor_infos.append(f) + f_tensor = _get_tensor_from_tensor_info(f, self._func_graph) + tensor_fetches.append(f_tensor) + return f_tensor + elif isinstance(f, ops.Tensor): + tensor_fetches.append(f) + return f + else: + graph_element = self.graph.as_graph_element(f) + return _fetch_preprocesing_callback(graph_element) + + fetches = nest.map_structure(_fetch_preprocesing_callback, fetches) + + for f in flat_feeds + tensor_fetches + operation_fetches: if f.graph is not self._func_graph: raise ValueError("Can only prune function whose feeds and fetches " - "are from this graph (%s). Tensor %s from graph %s" % + "are from this graph (%s). 
Input %s is from graph %s" % (self._func_graph, f, f.graph)) with self._func_graph.as_default(): pruned_graph = func_graph.FuncGraph(name) lift_map = lift_to_graph.lift_to_graph( - flat_fetches, pruned_graph, sources=flat_feeds + internal_captures) - pruned_graph.outputs.extend( - lift_map[x] for x in flat_fetches if isinstance(x, ops.Tensor)) + operation_fetches + tensor_fetches, + pruned_graph, + sources=flat_feeds + internal_captures) + pruned_graph.outputs.extend(lift_map[x] for x in tensor_fetches) pruned_graph.control_outputs.extend( [lift_map[operation] for operation in operation_fetches]) for external_capture, internal_capture in self.graph.captures.items(): pruned_graph.captures[external_capture] = lift_map[internal_capture] pruned_graph.inputs.extend(lift_map[x] for x in flat_feeds) pruned_graph.inputs.extend(pruned_graph.captures.values()) + for ti in tensor_infos: + if ti.WhichOneof("encoding") == "name": # Dense tensors only + t = pruned_graph.get_tensor_by_name(ti.name) + t.set_shape(tensor_shape.TensorShape(ti.tensor_shape)) + # pylint: disable=protected-access + for f in self.graph._functions.values(): + pruned_graph._add_function(f) + # pylint: enable=protected-access pruned_graph.variables = self.graph.variables diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD index ba323328b28..e5485b2b604 100644 --- a/tensorflow/python/feature_column/BUILD +++ b/tensorflow/python/feature_column/BUILD @@ -238,6 +238,7 @@ tf_py_test( py_test( name = "sequence_feature_column_integration_test", srcs = ["sequence_feature_column_integration_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ diff --git a/tensorflow/python/framework/composite_tensor.py b/tensorflow/python/framework/composite_tensor.py index bb265e091ef..657c47c336d 100644 --- a/tensorflow/python/framework/composite_tensor.py +++ b/tensorflow/python/framework/composite_tensor.py @@ -100,7 +100,7 @@ class CompositeTensor(object): @abc.abstractproperty def _is_graph_tensor(self): """Returns True if this tensor's components belong to a TF graph.""" - raise NotImplementedError("CompositeTensor._is_symbolic_tensor") + raise NotImplementedError("CompositeTensor._is_graph_tensor") def _consumers(self): """Returns a list of `Operation`s that consume this `CompositeTensor`. diff --git a/tensorflow/python/framework/composite_tensor_test.py b/tensorflow/python/framework/composite_tensor_test.py index 9765b531218..3f2d792dc00 100644 --- a/tensorflow/python/framework/composite_tensor_test.py +++ b/tensorflow/python/framework/composite_tensor_test.py @@ -18,6 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import gc +import sys +import weakref from absl.testing import parameterized from tensorflow.python.framework import composite_tensor @@ -324,6 +327,41 @@ class CompositeTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase): expected = [CT([True, True], metadata='A'), False] self.assertEqual(result, expected) + def testMemoryIsFreed(self): + # Note: we use `set` values for components and metadata because we need + # to construct weakrefs to them. Other builtin types, such as `list` and + # `tuple`, do not support weakrefs. + ct1 = CT(set([1, 2]), set(['no', 'leaks'])) + ct2 = CT(set([3, 4]), set(['no', 'leaks'])) + ct3 = CT(set([5, 6]), set(['other', 'metadata'])) + + # Note: map_structure exercises flatten, pack_sequence_as, and + # assert_same_structure. 
+ func = lambda x, y: x | y + ct4 = nest.map_structure(func, ct1, ct2, expand_composites=True) + + # Check that the exception-raising path in assert_same_structure + # doesn't leak any objects. + with self.assertRaisesRegexp(ValueError, + ".*don't have the same nested structure.*"): + nest.map_structure(func, ct2, ct3, expand_composites=True) + if hasattr(sys, 'exc_clear'): + sys.exc_clear() # Remove any references in exception stack traces. + + refs = [] + for ct in [ct1, ct2, ct3, ct4]: + refs.append(weakref.ref(ct)) + refs.append(weakref.ref(ct.components)) + refs.append(weakref.ref(ct.metadata)) + del ct # pylint: disable=undefined-loop-variable + + for ref in refs: + self.assertIsNotNone(ref()) + + del ct1, ct2, ct3, ct4 + gc.collect() + for ref in refs: + self.assertIsNone(ref()) if __name__ == '__main__': googletest.main() diff --git a/tensorflow/python/framework/config_test.py b/tensorflow/python/framework/config_test.py index 31e9cc90ac6..b64407d52f5 100644 --- a/tensorflow/python/framework/config_test.py +++ b/tensorflow/python/framework/config_test.py @@ -198,8 +198,10 @@ class ConfigTest(test.TestCase, parameterized.TestCase): with self.assertRaises(RuntimeError): context.set_log_device_placement(True) - with self.assertRaises(RuntimeError): - context.set_log_device_placement(False) + + # If the setting the device placement is a no-op, do not throw a runtime + # exception. + context.set_log_device_placement(False) @test_util.run_gpu_only @reset_eager diff --git a/tensorflow/python/framework/device_spec.py b/tensorflow/python/framework/device_spec.py index 83e517c2ae4..0d78e47a8ad 100644 --- a/tensorflow/python/framework/device_spec.py +++ b/tensorflow/python/framework/device_spec.py @@ -356,6 +356,18 @@ class DeviceSpecV2(object): return output def __eq__(self, other): + """Checks if the `other` DeviceSpec is same as the current instance, eg have + + same value for all the internal fields. + + Args: + other: Another DeviceSpec + + Returns: + Return `True` if `other` is also a DeviceSpec instance and has same value + as the current instance. + Return `False` otherwise. + """ return (isinstance(other, self.__class__) and self.to_string() == other.to_string()) diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py index ffac7d1eb36..d287ea2fcd4 100644 --- a/tensorflow/python/framework/function.py +++ b/tensorflow/python/framework/function.py @@ -352,6 +352,21 @@ class _DefinedFunction(object): if self._definition is not None or self._c_func is not None: return + # Copy variable collections (by reference) from the parent graph such that + # name based variable sharing (e.g. via tf.make_template) works between the + # func graph and parent graph. 
+ variable_keys = [] + variable_keys.extend(ops.GraphKeys._VARIABLE_COLLECTIONS) # pylint: disable=protected-access + variable_keys.append(vs._VARSTORE_KEY) # pylint: disable=protected-access + + collections_ref = {} + parent_collections_ref = ops.get_default_graph()._collections # pylint: disable=protected-access + for key in variable_keys: + if key not in parent_collections_ref: + parent_collections_ref[key] = collections_ref[key] = [] + else: + collections_ref[key] = parent_collections_ref[key] + temp_graph = func_graph_from_py_func( self._func, self._arg_names, @@ -359,6 +374,7 @@ class _DefinedFunction(object): self._func_name, self._capture_by_value, self._caller_device, + collections_ref=collections_ref, whitelisted_stateful_ops=self._whitelisted_stateful_ops, capture_resource_var_by_value=self._capture_resource_var_by_value) diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py index 21c6565ca97..57f50b888f5 100644 --- a/tensorflow/python/framework/function_test.py +++ b/tensorflow/python/framework/function_test.py @@ -47,6 +47,7 @@ from tensorflow.python.ops import logging_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops import template from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import test @@ -1753,13 +1754,64 @@ class VariableHoistingTest(test.TestCase): @test_util.run_deprecated_v1 def testBasic(self): - self._testSimpleModel(True) self._testSimpleModel(False) + self._testSimpleModel(True) @test_util.run_deprecated_v1 def testBasicResource(self): - self._testSimpleModel(True, use_resource=True) self._testSimpleModel(False, use_resource=True) + self._testSimpleModel(True, use_resource=True) + + +class TemplateTest(test.TestCase): + + @test_util.run_v1_only("make_template not supported in TF2") + def testBasic(self): + self.assertTemplateVariableSharing(use_resource=True, defun_first=False) + + @test_util.run_v1_only("make_template not supported in TF2") + def testBasicRef(self): + self.assertTemplateVariableSharing(use_resource=False, defun_first=False) + + @test_util.run_v1_only("make_template not supported in TF2") + def testBasicDefunFirst(self): + self.assertTemplateVariableSharing(use_resource=True, defun_first=True) + + @test_util.run_v1_only("make_template not supported in TF2") + def testBasicRefDefunFirst(self): + self.assertTemplateVariableSharing(use_resource=False, defun_first=True) + + def assertTemplateVariableSharing(self, use_resource, defun_first): + parameters = [] + + def MakeModel(x): + w = variable_scope.get_variable( + "w", (64, 64), + initializer=init_ops.random_uniform_initializer(seed=312), + use_resource=use_resource) + b = variable_scope.get_variable( + "b", (64), + initializer=init_ops.zeros_initializer(), + use_resource=use_resource) + parameters.extend((w, b)) + return math_ops.sigmoid(math_ops.matmul(x, w) + b) + + model = template.make_template("f", MakeModel, create_scope_now_=True) + + @function.Defun() + def ModelDefun(x): + return model(x) + + x = array_ops.placeholder(dtypes.float32) + if defun_first: + ModelDefun(x) + model(x) + else: + model(x) + ModelDefun(x) + w1, b1, w2, b2 = parameters # pylint: disable=unbalanced-tuple-unpacking + self.assertIs(w1, w2) + self.assertIs(b1, b2) class DevicePlacementTest(test.TestCase): diff --git a/tensorflow/python/framework/load_library.py 
b/tensorflow/python/framework/load_library.py index 301bb2af856..304745fe3c7 100644 --- a/tensorflow/python/framework/load_library.py +++ b/tensorflow/python/framework/load_library.py @@ -134,7 +134,7 @@ def load_library(library_location): """Loads a TensorFlow plugin. "library_location" can be a path to a specific shared object, or a folder. - If it is a folder, all sahred objects that are named "libtfkernel*" will be + If it is a folder, all shared objects that are named "libtfkernel*" will be loaded. When the library is loaded, kernels registered in the library via the `REGISTER_*` macros are made available in the TensorFlow process. diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py index fc566ce0b24..c0dc4c720ed 100644 --- a/tensorflow/python/framework/meta_graph.py +++ b/tensorflow/python/framework/meta_graph.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import copy +from distutils import version as distutils_version # pylint: disable=g-bad-import-order import os.path import re @@ -854,9 +855,30 @@ def import_scoped_meta_graph_with_return_elements( producer_op_list=producer_op_list, return_elements=return_elements) + # TensorFlow versions before 1.9 (not inclusive) exported SavedModels + # without a VariableDef.trainable field set. + tf_version = meta_graph_def.meta_info_def.tensorflow_version + if not tf_version: + variables_have_trainable = True + else: + variables_have_trainable = ( + distutils_version.LooseVersion(tf_version) + >= distutils_version.LooseVersion("1.9")) + + # Sort collections so we see TRAINABLE_VARIABLES first and can default these + # variables to trainable if the value is not set in their VariableDef. + sorted_collections = [] + if ops.GraphKeys.TRAINABLE_VARIABLES in meta_graph_def.collection_def: + sorted_collections.append( + (ops.GraphKeys.TRAINABLE_VARIABLES, + meta_graph_def.collection_def[ops.GraphKeys.TRAINABLE_VARIABLES])) + for key, value in sorted(meta_graph_def.collection_def.items()): + if key != ops.GraphKeys.TRAINABLE_VARIABLES: + sorted_collections.append((key, value)) + # Restores all the other collections. variable_objects = {} - for key, col_def in sorted(meta_graph_def.collection_def.items()): + for key, col_def in sorted_collections: # Don't add unbound_inputs to the new graph. if key == unbound_inputs_col_name: continue @@ -877,6 +899,14 @@ def import_scoped_meta_graph_with_return_elements( if variable is None: proto = proto_type() proto.ParseFromString(value) + if not variables_have_trainable: + # If the VariableDef proto does not contain a "trainable" + # property because it was exported before that property was + # added, we default it to whether the variable is in the + # TRAINABLE_VARIABLES collection. We've sorted + # TRAINABLE_VARIABLES to be first, so trainable variables will + # be created from that collection. 
+ proto.trainable = (key == ops.GraphKeys.TRAINABLE_VARIABLES) variable = from_proto( proto, import_scope=scope_to_prepend_to_names) variable_objects[value] = variable diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index e214877be8d..4e635478aa7 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -197,6 +197,7 @@ py_library( "//tensorflow/python/eager:monitoring", "//tensorflow/python/keras/distribute", "//tensorflow/python/keras/mixed_precision/experimental:autocast_variable", + "//tensorflow/python/keras/mixed_precision/experimental:loss_scale_optimizer", "//tensorflow/python/keras/mixed_precision/experimental:policy", "//tensorflow/python/module", "//tensorflow/python/training/tracking:data_structures", @@ -1270,6 +1271,7 @@ tf_py_test( shard_count = 6, tags = [ "no_oss", + "noasan", # TODO(b/132183295): Re-enable this. "notsan", ], ) @@ -1328,6 +1330,20 @@ tf_py_test( tags = ["notsan"], ) +tf_py_test( + name = "custom_training_loop_test", + size = "medium", + srcs = ["custom_training_loop_test.py"], + additional_deps = [ + ":keras", + "@absl_py//absl/testing:parameterized", + "//third_party/py/numpy", + "//tensorflow/python:client_testlib", + ], + shard_count = 4, + tags = ["notsan"], +) + tf_py_test( name = "network_test", size = "medium", diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index c4dba8ff6e6..21f6f29233e 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -95,6 +95,9 @@ _SESSION = threading.local() # either train mode (learning_phase == 1) or test mode (learning_phase == 0). _GRAPH_LEARNING_PHASES = weakref.WeakKeyDictionary() +# This dictionary holds a mapping {graph: set_of_freezable_variables}. +# Each set tracks objects created via `freezable_variable` in the graph. +_FREEZABLE_VARS = weakref.WeakKeyDictionary() # _DUMMY_EAGER_GRAPH is used as a key in _GRAPH_LEARNING_PHASES. # We keep a separate reference to it to make sure it does not get removed from @@ -222,6 +225,7 @@ def clear_session(): global _GRAPH_VARIABLES # pylint: disable=global-variable-not-assigned global _GRAPH_TF_OPTIMIZERS # pylint: disable=global-variable-not-assigned global _GRAPH + global _FREEZABLE_VARS _GRAPH = None ops.reset_default_graph() reset_uids() @@ -235,6 +239,7 @@ def clear_session(): _GRAPH_LEARNING_PHASES[graph] = phase _GRAPH_VARIABLES.pop(graph, None) _GRAPH_TF_OPTIMIZERS.pop(graph, None) + _FREEZABLE_VARS.pop(graph, None) @keras_export('keras.backend.manual_variable_initialization') @@ -951,6 +956,55 @@ def is_placeholder(x): return False +def freezable_variable(value, shape=None, name=None): + """A tensor-like object whose value can be updated only up until execution. + + After creating the freezable variable, you can update its value by calling + `var.update_value(new_value)` (similar to a regular variable). + Unlike an actual variable, the value used during execution is the current + value at the time the execution function (`backend.function()`) was created. + + This is an internal API, expected to be temporary. It is used to implement a + mutable `trainable` property for `BatchNormalization` layers, with a frozen + value after model compilation. + + We don't use a plain variable in this case because we need the value used + in a specific model to be frozen after `compile` has been called + (e.g. GAN use case). + + Arguments: + value: The initial value for the tensor-like object. 
+ shape: The shape for the tensor-like object (cannot be changed). + name: The name for the tensor-like object. + + Returns: + A tensor-like object with a static value that can be updated via + `x.update_value(new_value)`, up until creating an execution function + (afterwards the value is fixed). + """ + graph = get_graph() + with graph.as_default(): + x = array_ops.placeholder_with_default( + value, shape=shape, name=name) + x._initial_value = value + x._current_value = value + + def update_value(new_value): + x._current_value = new_value + + def get_value(): + return x._current_value + + x.update_value = update_value + x.get_value = get_value + + global _FREEZABLE_VARS + if graph not in _FREEZABLE_VARS: + _FREEZABLE_VARS[graph] = weakref.WeakSet() + _FREEZABLE_VARS[graph].add(x) + return x + + @keras_export('keras.backend.shape') def shape(x): """Returns the symbolic shape of a tensor or variable. @@ -3244,6 +3298,9 @@ class EagerExecutionFunction(object): # `update.op` may have been None in certain cases. updates_ops.append(update) + self._freezable_vars_to_feed = [] + self._freezable_vars_values = [] + freezable_vars_from_keras_graph = _FREEZABLE_VARS.get(global_graph, {}) with _scratch_graph() as exec_graph: global_graph = get_graph() if source_graph not in (exec_graph, global_graph): @@ -3264,6 +3321,18 @@ class EagerExecutionFunction(object): legacy_update_ops = [(lifted_map[p], lifted_map.get(p_new, p_new)) for p, p_new in legacy_update_ops] + # Keep track of the value to feed to any "freezable variables" + # created in this graph. + for old_op, new_op in lifted_map.items(): + if old_op in freezable_vars_from_keras_graph: + frozen_var = old_op + if frozen_var._initial_value != frozen_var._current_value: + # We only feed a frozen_variable if its value has changed; + # otherwise it can rely on the default value of the + # underlying placeholder_with_default. 
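# Illustrative sketch (not part of the patch): intended use of the internal
# `freezable_variable` helper defined above. The value may be updated freely
# until an execution function (`backend.function`) is built; the function then
# uses the value it observed at construction time, which is why
# EagerExecutionFunction records `_current_value` as shown above.
from tensorflow.python.keras import backend as K

flag = K.freezable_variable(True, shape=(), name='bn_frozen_trainable')
flag.update_value(False)          # Allowed: no execution function built yet.
assert flag.get_value() is False  # Current Python-side value.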
+ self._freezable_vars_to_feed.append(new_op) + self._freezable_vars_values.append(frozen_var._current_value) + # Consolidate updates with exec_graph.as_default(): outputs = cast_variables_to_tensor(outputs) @@ -3272,14 +3341,16 @@ class EagerExecutionFunction(object): updates_ops.append(state_ops.assign(p, p_new)) self.inputs, self.outputs = inputs, outputs + self._input_references = self.inputs + self._freezable_vars_to_feed with ops.control_dependencies(updates_ops): self.outputs[0] = array_ops.identity(self.outputs[0]) - exec_graph.inputs = self.inputs + list(exec_graph.captures.values()) + exec_graph.inputs = self._input_references + list( + exec_graph.captures.values()) exec_graph.outputs = self.outputs graph_fn = eager_function.ConcreteFunction(exec_graph) - graph_fn._num_positional_args = len(self.inputs) + graph_fn._num_positional_args = len(self._input_references) graph_fn._arg_keywords = [] self._graph_fn = graph_fn @@ -3293,9 +3364,11 @@ class EagerExecutionFunction(object): x.op.inputs[0]) def __call__(self, inputs): - inputs = nest.flatten(inputs) + input_values = nest.flatten(inputs) + if self._freezable_vars_values: + input_values = input_values + self._freezable_vars_values converted_inputs = [] - for tensor, value in zip(self.inputs, inputs): + for tensor, value in zip(self._input_references, input_values): if value is None: # Assume `value` is a placeholder with default value = self._placeholder_default_values.get(tensor, None) @@ -5411,6 +5484,8 @@ if not os.path.exists(_config_path): def in_multi_worker_mode(): """Whether we are operating in a Multi-Worker setting.""" + # TODO(rchao): Consider a warning if user uses multiple `model` method + # calls in multi-worker setting. tf_config = json.loads(os.environ.get('TF_CONFIG', '{}')) cluster_spec = server_lib.ClusterSpec(tf_config.get('cluster', {})) return tf_config and 'master' not in cluster_spec.jobs diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index f1153bca30f..ef3657c411e 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -51,6 +51,11 @@ try: except ImportError: requests = None +# Constant for `tf.keras.Model` to store the epoch at which the most recently +# saved checkpoint was saved. See `Model._get_updated_initial_epoch()`'s +# docstring for more information. +CKPT_SAVED_EPOCH = '_ckpt_saved_epoch' + def configure_callbacks(callbacks, model, @@ -111,6 +116,18 @@ def configure_callbacks(callbacks, mode=mode) callback_list.model.stop_training = False + # pylint: disable=protected-access + if callback_list.model._ckpt_saved_epoch is not None: + # The attribute `_ckpt_saved_epoch` is supposed to be None at the start of + # training (it should be made None at the end of successful multi-worker + # training), unless the user's `fit()` does not end successfully before + # making another `fit()` call. + raise ValueError( + '`tf.Keras.Model._ckpt_saved_epoch` attr should be None at ' + 'callback setup time. Please ensure `fit()` in multi-worker ' + 'training finishes successfully before starting a new one. If the ' + 'issue persists, try using only one `model.fit()` in multi-worker ' + 'training.') return callback_list @@ -904,17 +921,36 @@ class ModelCheckpoint(Callback): # worker setting (e.g. non-chief worker in ParameterServerStrategy). 
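# Illustrative sketch (not part of the patch): how the CKPT_SAVED_EPOCH
# attribute is meant to be consumed when a preempted worker restarts.
# `resume_epoch_from` is a hypothetical helper, not a TensorFlow API; the real
# plumbing lives in the ModelCheckpoint/Model changes in this diff.
def resume_epoch_from(model):
  # `_ckpt_saved_epoch` is restored from the weight file on load (see the
  # Network.load_weights change later in this diff) and is read back from the
  # HDF5 attribute as a string, hence the int() cast.
  saved_epoch = getattr(model, '_ckpt_saved_epoch', None)
  if saved_epoch is None:
    return 0  # Fresh run: start from the first epoch.
  # Training was interrupted after `saved_epoch` completed, so resume at the
  # following epoch.
  return int(saved_epoch) + 1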
return - if self.load_weights_on_restart: + filepath_to_load = self._get_most_recently_modified_file_matching_pattern( + self.filepath) + if (self.load_weights_on_restart and filepath_to_load is not None and + os.path.exists(filepath_to_load)): try: # `filepath` may contain placeholders such as `{epoch:02d}`, and thus # it attempts to load the most recently modified file with file name # matching the pattern. - self.model.load_weights( - self._get_most_recently_modified_file_matching_pattern( - self.filepath)) + self.model.load_weights(filepath_to_load) except (IOError, ValueError) as e: raise ValueError('Error loading file from {}. Reason: {}'.format( - self.filepath, e)) + filepath_to_load, e)) + + def on_train_end(self, logs=None): + logs = logs or {} + # pylint: disable=protected-access + if self.model._ckpt_saved_epoch is not None: + # Make `_ckpt_saved_epoch` attribute `None` at the end of training as it + # is only used during the training. Currently it is decided not to + # support fault tolerance across multiple `model.fit()` or `model.fit()` + # with other `model` methods. + epoch = self.model._ckpt_saved_epoch + self.model._ckpt_saved_epoch = None + # TODO(rchao): Support all `save_weights_only` and `save_best_only` cases. + # This will be done with the help of a decoupled training state file that + # contains both epoch and model weights. + if self.save_weights_only and not self.save_best_only: + file_handle, filepath = self._get_file_handle_and_path(epoch, logs) + self.model.save_weights(filepath, overwrite=True) + self._maybe_remove_file(file_handle, filepath) def on_batch_end(self, batch, logs=None): logs = logs or {} @@ -944,23 +980,7 @@ class ModelCheckpoint(Callback): if isinstance(self.save_freq, int) or self.epochs_since_last_save >= self.period: self.epochs_since_last_save = 0 - - # TODO(rchao): Replace dc_context reference with - # distributed_training_utils.should_current_worker_checkpoint() once - # distributed_training_utils.py no longer depends on callbacks.py. - if not K.in_multi_worker_mode() or dc_context.get_current_worker_context( - ).should_checkpoint: - filepath = self.filepath.format(epoch=epoch + 1, **logs) - else: - # If this is multi-worker training, and this worker should not - # save checkpoint, we replace the filepath with a dummy filepath so - # it writes to a file that will be removed at the end of _save_model() - # call. This is because the SyncOnReadVariable needs to be synced across - # all the workers in order to be read, and all workers need to initiate - # that. - file_handle, temp_file_name = tempfile.mkstemp() - extension = os.path.splitext(self.filepath)[1] - filepath = temp_file_name + extension + file_handle, filepath = self._get_file_handle_and_path(epoch, logs) if self.save_best_only: current = logs.get(self.monitor) @@ -986,24 +1006,67 @@ class ModelCheckpoint(Callback): if self.verbose > 0: print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath)) if self.save_weights_only: + if K.in_multi_worker_mode(): + # TODO(rchao): Save to an additional training state file for FT, + # instead of adding an attr to weight file. With this we can support + # the cases of all combinations with `save_weights_only`, + # `save_best_only`, and `save_format` parameters. + # pylint: disable=protected-access + self.model._ckpt_saved_epoch = epoch self.model.save_weights(filepath, overwrite=True) else: self.model.save(filepath, overwrite=True) - # Remove the file in multi-worker training where this worker should - # not checkpoint. 
- if K.in_multi_worker_mode( ) and not dc_context.get_current_worker_context().should_checkpoint: - os.close(file_handle) - os.remove(filepath) + self._maybe_remove_file(file_handle, filepath) + + def _get_file_handle_and_path(self, epoch, logs): + """Returns the file handle and path.""" + # TODO(rchao): Replace dc_context reference with + # distributed_training_utils.should_current_worker_checkpoint() once + # distributed_training_utils.py no longer depends on callbacks.py. + if not K.in_multi_worker_mode() or dc_context.get_current_worker_context( + ).should_checkpoint: + return None, self.filepath.format(epoch=epoch + 1, **logs) + else: + # If this is multi-worker training, and this worker should not + # save checkpoint, we replace the filepath with a dummy filepath so + # it writes to a file that will be removed at the end of _save_model() + # call. This is because the SyncOnReadVariable needs to be synced across + # all the workers in order to be read, and all workers need to initiate + # that. + file_handle, temp_file_name = tempfile.mkstemp() + extension = os.path.splitext(self.filepath)[1] + return file_handle, temp_file_name + extension + + def _maybe_remove_file(self, file_handle, filepath): + # Remove the file in multi-worker training where this worker should + # not checkpoint. It is a dummy file previously saved for sync distributed + # training. + if K.in_multi_worker_mode( + ) and not dc_context.get_current_worker_context().should_checkpoint: + os.close(file_handle) + os.remove(filepath) def _get_most_recently_modified_file_matching_pattern(self, pattern): """Returns the most recently modified filepath matching pattern. Pattern may contain python formatting placeholder. If `tf.train.latest_checkpoint()` does not return None, use that; otherwise, - check for most recently modified one that matches the pattern. This utility - function is best demonstrated via an example: + check for most recently modified one that matches the pattern. + + In the rare case where more than one pattern-matching file has + the same modified time that is most recent among all, return the filepath + that is largest (by `>` operator, lexicographically using the numeric + equivalents). This provides a tie-breaker when multiple files are most + recent. Note that a larger `filepath` can sometimes indicate a later time of + modification (for instance, when epoch/batch is used as a formatting option), + but not necessarily (when accuracy or loss is used). The tie-breaker is + put in the logic as a best effort to return the most recent file, and to + avoid a nondeterministic result. + + Modified time of a file is obtained with `os.path.getmtime()`. + + This utility function is best demonstrated via an example: ```python file_pattern = 'f.batch{batch:02d}epoch{epoch:02d}.h5' @@ -1043,14 +1106,36 @@ class ModelCheckpoint(Callback): latest_mod_time = 0 file_path_with_latest_mod_time = None + n_file_with_latest_mod_time = 0 + file_path_with_largest_file_name = None + for file_name in os.listdir(dir_name): + # Only consider if `file_name` matches the pattern.
if re.match(base_name_regex, file_name): file_path = os.path.join(dir_name, file_name) mod_time = os.path.getmtime(file_path) + if (file_path_with_largest_file_name is None or + file_path > file_path_with_largest_file_name): + file_path_with_largest_file_name = file_path if mod_time > latest_mod_time: latest_mod_time = mod_time file_path_with_latest_mod_time = file_path - return file_path_with_latest_mod_time + # In the case a file with later modified time is found, reset + # the counter for the number of files with latest modified time. + n_file_with_latest_mod_time = 1 + elif mod_time == latest_mod_time: + # In the case a file has modified time tied with the most recent, + # increment the counter for the number of files with latest modified + # time by 1. + n_file_with_latest_mod_time += 1 + + if n_file_with_latest_mod_time == 1: + # Return the sole file that has most recent modified time. + return file_path_with_latest_mod_time + else: + # If there are more than one file having latest modified time, return + # the file path with the largest file name. + return file_path_with_largest_file_name @keras_export('keras.callbacks.EarlyStopping') @@ -1239,6 +1324,20 @@ class LearningRateScheduler(Callback): (integer, indexed from 0) and returns a new learning rate as output (float). verbose: int. 0: quiet, 1: update messages. + + ```python + # This function keeps the learning rate at 0.001 for the first ten epochs + # and decreases it exponentially after that. + def scheduler(epoch): + if epoch < 10: + return 0.001 + else: + return 0.001 * tf.math.exp(0.1 * (10 - epoch)) + + callback = tf.keras.callbacks.LearningRateScheduler(scheduler) + model.fit(data, labels, epochs=100, callbacks=[callback], + validation_data=(val_data, val_labels)) + ``` """ def __init__(self, schedule, verbose=0): @@ -1310,6 +1409,14 @@ class TensorBoard(Callback): profile_batch: Profile the batch to sample compute characteristics. By default, it will profile the second batch. Set profile_batch=0 to disable profiling. Must run in TensorFlow eager mode. + embeddings_freq: frequency (in epochs) at which embedding layers will + be visualized. If set to 0, embeddings won't be visualized. + embeddings_metadata: a dictionary which maps layer name to a file name in + which metadata for this embedding layer is saved. See the + [details]( + https://www.tensorflow.org/how_tos/embedding_viz/#metadata_optional) + about metadata files format. In case if the same metadata file is + used for all embedding layers, string can be passed. Raises: ValueError: If histogram_freq is set and no validation data is provided. 
@@ -1324,6 +1431,8 @@ class TensorBoard(Callback): write_images=False, update_freq='epoch', profile_batch=2, + embeddings_freq=0, + embeddings_metadata=None, **kwargs): super(TensorBoard, self).__init__() self._validate_kwargs(kwargs) @@ -1336,6 +1445,8 @@ class TensorBoard(Callback): self.update_freq = 1 else: self.update_freq = update_freq + self.embeddings_freq = embeddings_freq + self.embeddings_metadata = embeddings_metadata self._samples_seen = 0 self._samples_seen_at_last_write = 0 @@ -1364,17 +1475,21 @@ class TensorBoard(Callback): if kwargs.get('write_grads', False): logging.warning('`write_grads` will be ignored in TensorFlow 2.0 ' 'for the `TensorBoard` Callback.') - if kwargs.get('embeddings_freq', False): - logging.warning('Embeddings will be ignored in TensorFlow 2.0 ' - 'for the `TensorBoard` Callback.') if kwargs.get('batch_size', False): logging.warning('`batch_size` is no longer needed in the ' '`TensorBoard` Callback and will be ignored ' 'in TensorFlow 2.0.') + if kwargs.get('embeddings_layer_names', False): + logging.warning('`embeddings_layer_names` is not supported in ' + 'TensorFlow 2.0. Instead, all `Embedding` layers ' + 'will be visualized.') + if kwargs.get('embeddings_data', False): + logging.warning('`embeddings_data` is not supported in TensorFlow ' + '2.0. Instead, all `Embedding` variables will be ' + 'visualized.') unrecognized_kwargs = set(kwargs.keys()) - { - 'write_grads', 'embeddings_freq', 'embeddings_layer_names', - 'embeddings_metadata', 'embeddings_data', 'batch_size' + 'write_grads', 'embeddings_layer_names', 'embeddings_data', 'batch_size' } # Only allow kwargs that were supported in V1. @@ -1399,6 +1514,48 @@ class TensorBoard(Callback): if summary_writable: summary_ops_v2.keras_model('keras', self.model, step=0) + if self.embeddings_freq: + self._configure_embeddings() + + def _configure_embeddings(self): + """Configure the Projector for embeddings.""" + # TODO(omalleyt): Add integration tests. + from tensorflow.python.keras.layers import embeddings + try: + from tensorboard.plugins import projector + except ImportError: + raise ImportError('Failed to import TensorBoard. Please make sure that ' + 'TensorBoard integration is complete.') + config = projector.ProjectorConfig() + for layer in self.model.layers: + if isinstance(layer, embeddings.Embedding): + embedding = config.embeddings.add() + embedding.tensor_name = layer.embeddings.name + + if self.embeddings_metadata is not None: + if isinstance(self.embeddings_metadata, str): + embedding.metadata_path = self.embeddings_metadata + else: + if layer.name in self.embeddings_metadata: + embedding.metadata_path = self.embeddings_metadata.pop(layer.name) + + if self.embeddings_metadata: + raise ValueError('Unrecognized `Embedding` layer names passed to ' + '`keras.callbacks.TensorBoard` `embeddings_metadata` ' + 'argument: ' + str(self.embeddings_metadata.keys())) + + class DummyWriter(object): + """Dummy writer to conform to `Projector` API.""" + + def __init__(self, logdir): + self.logdir = logdir + + def get_logdir(self): + return self.logdir + + writer = DummyWriter(self.log_dir) + projector.visualize_embeddings(writer, config) + def _close_writers(self): """Close all remaining open file writers owned by this callback.
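# Illustrative sketch (not part of the patch): using the `embeddings_freq` and
# `embeddings_metadata` arguments added to the TensorBoard callback above.
# The layer name, log directory, and metadata file are example values.
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 64, name='word_embedding'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')

tensorboard = tf.keras.callbacks.TensorBoard(
    log_dir='/tmp/logs',
    embeddings_freq=1,  # Write the projector config every epoch.
    embeddings_metadata={'word_embedding': 'metadata.tsv'})
# model.fit(x_train, y_train, epochs=5, callbacks=[tensorboard])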
@@ -1464,6 +1621,9 @@ class TensorBoard(Callback): if self.histogram_freq and epoch % self.histogram_freq == 0: self._log_weights(epoch) + if self.embeddings_freq and epoch % self.embeddings_freq == 0: + self._log_embeddings(epoch) + def on_train_end(self, logs=None): if self._is_tracing: self._log_trace() @@ -1568,6 +1728,11 @@ class TensorBoard(Callback): if len(shape) == 4 and shape[-1] in [1, 3, 4]: summary_ops_v2.image(weight_name, w_img, step=epoch) + def _log_embeddings(self, epoch): + embeddings_ckpt = os.path.join(self.log_dir, 'train', + 'keras_embedding.ckpt-{}'.format(epoch)) + self.model.save_weights(embeddings_ckpt) + @keras_export('keras.callbacks.ReduceLROnPlateau') class ReduceLROnPlateau(Callback): @@ -1588,22 +1753,20 @@ class ReduceLROnPlateau(Callback): Arguments: monitor: quantity to be monitored. - factor: factor by which the learning rate will - be reduced. new_lr = lr * factor - patience: number of epochs with no improvement - after which learning rate will be reduced. + factor: factor by which the learning rate will be reduced. new_lr = lr * + factor + patience: number of epochs with no improvement after which learning rate + will be reduced. verbose: int. 0: quiet, 1: update messages. - mode: one of {auto, min, max}. In `min` mode, - lr will be reduced when the quantity - monitored has stopped decreasing; in `max` - mode it will be reduced when the quantity - monitored has stopped increasing; in `auto` - mode, the direction is automatically inferred - from the name of the monitored quantity. - min_delta: threshold for measuring the new optimum, - to only focus on significant changes. - cooldown: number of epochs to wait before resuming - normal operation after lr has been reduced. + mode: one of {auto, min, max}. In `min` mode, lr will be reduced when the + quantity monitored has stopped decreasing; in `max` mode it will be + reduced when the quantity monitored has stopped increasing; in `auto` + mode, the direction is automatically inferred from the name of the + monitored quantity. + min_delta: threshold for measuring the new optimum, to only focus on + significant changes. + cooldown: number of epochs to wait before resuming normal operation after + lr has been reduced. min_lr: lower bound on the learning rate. """ diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index 5f866414c83..2ccee6ded73 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -559,7 +559,6 @@ class KerasCallbacksTest(keras_parameterized.TestCase): for epoch in range(initial_epochs): self.assertTrue(os.path.exists(filepath.format(epoch=epoch + 1))) self.assertFalse(os.path.exists(filepath.format(epoch=initial_epochs + 1))) - self.skipTest('b/131852849') self.assertEqual( callback._get_most_recently_modified_file_matching_pattern(filepath), filepath.format(epoch=initial_epochs)) @@ -580,6 +579,9 @@ class KerasCallbacksTest(keras_parameterized.TestCase): (model, train_ds, filepath, weights_after_one_more_epoch ) = self._run_load_weights_on_restart_test_common_iterations() + # Sleep for some short time period ensuring the files are created with + # a different time (in MacOS OSS the granularity is only 1 second). 
+ time.sleep(2) callback = keras.callbacks.ModelCheckpoint( filepath=filepath, save_weights_only=save_weights_only, @@ -655,6 +657,9 @@ class KerasCallbacksTest(keras_parameterized.TestCase): (model, train_ds, filepath, _) = self._run_load_weights_on_restart_test_common_iterations() + # Sleep for some short time period to ensure the files are created with + # a different time (in MacOS OSS the granularity is only 1 second). + time.sleep(2) callback = keras.callbacks.ModelCheckpoint( filepath=filepath, save_weights_only=True) model.load_weights( diff --git a/tensorflow/python/keras/custom_training_loop_test.py b/tensorflow/python/keras/custom_training_loop_test.py new file mode 100644 index 00000000000..d2b82c8a55f --- /dev/null +++ b/tensorflow/python/keras/custom_training_loop_test.py @@ -0,0 +1,148 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for custom training loops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized +import numpy as np + +from tensorflow.python import keras +from tensorflow.python.eager import backprop +from tensorflow.python.eager import def_function +from tensorflow.python.framework import ops +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import testing_utils +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +class LayerWithLosses(keras.layers.Layer): + + def build(self, input_shape): + self.v = self.add_weight( + name='hey', + shape=(), + initializer='ones', + regularizer=keras.regularizers.l1(100)) + + def call(self, inputs): + self.add_loss(math_ops.reduce_sum(inputs)) + return self.v * inputs + + +class LayerWithMetrics(keras.layers.Layer): + + def build(self, input_shape): + self.mean = keras.metrics.Mean(name='mean_object') + + def call(self, inputs): + self.add_metric( + math_ops.reduce_mean(inputs), name='mean_tensor', aggregation='mean') + self.add_metric(self.mean(inputs)) + return inputs + + +def add_loss_step(defun): + optimizer = keras.optimizer_v2.adam.Adam() + model = testing_utils.get_model_from_layers([LayerWithLosses()], + input_shape=(10,)) + + def train_step(x): + with backprop.GradientTape() as tape: + model(x) + assert len(model.losses) == 2 + loss = math_ops.reduce_sum(model.losses) + gradients = tape.gradient(loss, model.trainable_weights) + optimizer.apply_gradients(zip(gradients, model.trainable_weights)) + return loss + + if defun: + train_step = def_function.function(train_step) + + x = array_ops.ones((10, 10)) + return train_step(x) + + +def batch_norm_step(defun): + optimizer = keras.optimizer_v2.adadelta.Adadelta() + model = testing_utils.get_model_from_layers([ + keras.layers.BatchNormalization(momentum=0.9), + keras.layers.Dense(1, 
kernel_initializer='zeros', activation='softmax') + ], + input_shape=(10,)) + + def train_step(x, y): + with backprop.GradientTape() as tape: + y_pred = model(x, training=True) + loss = keras.losses.binary_crossentropy(y, y_pred) + gradients = tape.gradient(loss, model.trainable_weights) + optimizer.apply_gradients(zip(gradients, model.trainable_weights)) + return loss, model(x, training=False) + + if defun: + train_step = def_function.function(train_step) + + x, y = array_ops.ones((10, 10)), array_ops.ones((10, 1)) + return train_step(x, y) + + +def add_metric_step(defun): + optimizer = keras.optimizer_v2.rmsprop.RMSprop() + model = testing_utils.get_model_from_layers([ + LayerWithMetrics(), + keras.layers.Dense(1, kernel_initializer='zeros', activation='softmax') + ], + input_shape=(10,)) + + def train_step(x, y): + with backprop.GradientTape() as tape: + y_pred_1 = model(x) + y_pred_2 = model(2 * x) + y_pred = y_pred_1 + y_pred_2 + loss = keras.losses.mean_squared_error(y, y_pred) + gradients = tape.gradient(loss, model.trainable_weights) + optimizer.apply_gradients(zip(gradients, model.trainable_weights)) + assert len(model.metrics) == 2 + return [m.result() for m in model.metrics] + + if defun: + train_step = def_function.function(train_step) + + x, y = array_ops.ones((10, 10)), array_ops.zeros((10, 1)) + metrics = train_step(x, y) + assert np.allclose(metrics[0], 1.5) + assert np.allclose(metrics[1], 1.5) + return metrics + + +@keras_parameterized.run_with_all_model_types +class CustomTrainingLoopTest(keras_parameterized.TestCase): + + @parameterized.named_parameters(('add_loss_step', add_loss_step), + ('add_metric_step', add_metric_step), + ('batch_norm_step', batch_norm_step)) + def test_eager_and_tf_function(self, train_step): + eager_result = train_step(defun=False) + fn_result = train_step(defun=True) + self.assertAllClose(eager_result, fn_result) + + +if __name__ == '__main__': + ops.enable_eager_execution() + test.main() diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD index 415016bebad..dc7b1e03b59 100644 --- a/tensorflow/python/keras/distribute/BUILD +++ b/tensorflow/python/keras/distribute/BUILD @@ -327,15 +327,17 @@ cuda_py_test( "//tensorflow/python/distribute:distribute_coordinator", "//tensorflow/python/keras", ], - shard_count = 4, + shard_count = 14, tags = [ "multi_and_single_gpu", + "no_oss", # TODO(b/132384649): Flakily times out. 
], ) py_binary( name = "mnist_multi_worker", srcs = ["mnist_multi_worker.py"], + python_version = "PY2", deps = [":mnist_multi_worker_lib"], ) diff --git a/tensorflow/python/keras/distribute/distribute_strategy_test.py b/tensorflow/python/keras/distribute/distribute_strategy_test.py index dd8462fcf0f..99540677724 100644 --- a/tensorflow/python/keras/distribute/distribute_strategy_test.py +++ b/tensorflow/python/keras/distribute/distribute_strategy_test.py @@ -35,6 +35,7 @@ from tensorflow.python.framework import test_util from tensorflow.python.keras import testing_utils from tensorflow.python.keras.distribute import distributed_training_utils from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras +from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_keras from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.parsing_ops import gen_parsing_ops @@ -115,7 +116,7 @@ def multi_inputs_multi_outputs_model(): inputs=[input_a, input_b, input_m], outputs=[output_c, output_d]) model.compile( loss='categorical_crossentropy', - optimizer=gradient_descent.GradientDescentOptimizer(0.001), + optimizer=gradient_descent_keras.SGD(learning_rate=0.001), metrics={ 'dense_2': 'categorical_accuracy', 'dense_3': 'categorical_accuracy' @@ -371,7 +372,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase, keras_model.compile( loss='categorical_crossentropy', metrics=[keras.metrics.CategoricalAccuracy()], - optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01), + optimizer=rmsprop_keras.RMSprop(learning_rate=0.01), cloning=cloning) config = run_config_lib.RunConfig( tf_random_seed=_RANDOM_SEED, @@ -405,7 +406,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase, keras_model.compile( loss='categorical_crossentropy', metrics=[keras.metrics.CategoricalAccuracy()], - optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01), + optimizer=rmsprop_keras.RMSprop(learning_rate=0.01), cloning=cloning) config = run_config_lib.RunConfig( tf_random_seed=_RANDOM_SEED, @@ -477,36 +478,6 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase, eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1) self.assertLess(eval_results['loss'], baseline_eval_results['loss']) - @combinations.generate( - combinations.combine( - distribution=[ - strategy_combinations.mirrored_strategy_with_gpu_and_cpu - ], - mode=['graph'], - cloning=[True, False])) - def test_keras_optimizer_with_distribution_strategy(self, distribution, - cloning): - keras_model = simple_sequential_model() - keras_model.compile( - loss='categorical_crossentropy', - optimizer=keras.optimizers.rmsprop(lr=0.01), - cloning=cloning) - - config = run_config_lib.RunConfig( - tf_random_seed=_RANDOM_SEED, - model_dir=self._base_dir, - train_distribute=distribution) - with self.cached_session(): - est_keras = keras_lib.model_to_estimator( - keras_model=keras_model, config=config) - with self.assertRaisesRegexp(ValueError, - 'Only TensorFlow native optimizers are ' - 'supported with DistributionStrategy.'): - est_keras.train(input_fn=get_ds_train_input_fn, steps=_TRAIN_SIZE / 16) - - writer_cache.FileWriterCache.clear() - gfile.DeleteRecursively(self._config.model_dir) - class TestDistributionStrategyWithNumpyArrays(test.TestCase, parameterized.TestCase): @@ -1659,12 +1630,12 @@ class TestDistributionStrategyWithKerasModels(test.TestCase, x = np.ones((64, 10)).astype('float32') model = 
_make_model_with_add_loss() - model.compile('sgd', cloning=cloning) + model.compile('sgd') history = model.fit(x, steps_per_epoch=2, epochs=1) with distribution.scope(): ds_model = _make_model_with_add_loss() - ds_model.compile('sgd') + ds_model.compile('sgd', cloning=cloning) ds_history = ds_model.fit(x, steps_per_epoch=2, epochs=1) self.assertAllClose(history.history, ds_history.history) diff --git a/tensorflow/python/keras/distribute/distributed_training_utils.py b/tensorflow/python/keras/distribute/distributed_training_utils.py index f3d29cf44ab..5ecdf883c21 100644 --- a/tensorflow/python/keras/distribute/distributed_training_utils.py +++ b/tensorflow/python/keras/distribute/distributed_training_utils.py @@ -618,7 +618,8 @@ def is_distributing_by_cloning(model): True if the `model` is going to be distributed using cloning and False otherwise. """ - return (model._cloning or not context.executing_eagerly() or + return (model._cloning or model._compile_distribution or + not context.executing_eagerly() or K.is_tpu_strategy(model._distribution_strategy)) diff --git a/tensorflow/python/keras/distribute/multi_worker_callback_test.py b/tensorflow/python/keras/distribute/multi_worker_callback_test.py index 263bafcec74..efa7d9beec4 100644 --- a/tensorflow/python/keras/distribute/multi_worker_callback_test.py +++ b/tensorflow/python/keras/distribute/multi_worker_callback_test.py @@ -17,10 +17,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function - +import json import os import sys import tempfile +import threading from absl.testing import parameterized @@ -351,6 +352,258 @@ class KerasMultiWorkerCallbackTest(test_base.IndependentWorkerTestBase, load_weights_on_restart=True) ]) + class PreemptionAtBatchBoundarySimulatingCallback(callbacks.Callback): + """Callback to simulate preemption at batch boundary.""" + + def on_epoch_begin(self, epoch, logs=None): + self._current_epoch = epoch + + def on_batch_begin(self, batch, logs=None): + if self._current_epoch == 1 and batch == 1 and not test_base.is_chief(): + # Simulate preemption at the start of the second batch of the second epoch. + raise RuntimeError('Preemption!') + + def on_batch_end(self, batch, logs=None): + assert self._current_epoch < 1 or batch < 1 + + def on_epoch_end(self, epoch, logs=None): + assert epoch < 1 + + class PreemptionAtEpochBoundarySimulatingCallback(callbacks.Callback): + """Callback to simulate preemption at epoch boundary.""" + + def on_epoch_begin(self, epoch, logs=None): + if epoch == 1 and not test_base.is_chief(): + # Simulate preemption at the start of the second epoch. + raise RuntimeError('Preemption!') + + def on_epoch_end(self, epoch, logs=None): + assert epoch < 1 + + @combinations.generate( + combinations.combine( + mode=['graph'], + strategy_cls=[collective_strategy.CollectiveAllReduceStrategy], + required_gpus=[0, 1], + file_format=['h5'], # TODO(rchao): Support TF format. + preemption_callback=[ + PreemptionAtEpochBoundarySimulatingCallback, + PreemptionAtBatchBoundarySimulatingCallback + ])) + def testFaultToleranceInSyncStrategy(self, strategy_cls, file_format, + preemption_callback): + """Test fault-tolerance with multi-threading using sync dist-strat. + + This test simulates multi-worker training that is interrupted by a + preemption, using two threads, one representing the chief worker and the + other a non-chief worker, where the non-chief raises an error in the middle + of the training loop.
Upon excepting the error, a new thread with a new cluster + spec is created to simulate the recovered non-chief worker. Meanwhile, the + chief worker cannot proceed and hangs since the non-chief worker has + crashed. To simulate a restart of the chief, a new thread has been prepared + to run to take over chief with the help of a condition variable. It is + expected that after the restart of both chief and non-chief workers, the + training continues from the epoch they previously failed at. The test + concludes by verifying the preemption-interrupted training can finish with + the same loss and accuracy had the preemption not occurred. + + Arguments: + strategy_cls: The strategy class to use. + file_format: `h5` or `tf`. + preemption_callback: The callback to simulate preemption. + """ + + def _independent_worker_fn(*args, **kwargs): # pylint: disable=unused-argument + with test.mock.patch.object(dc, '_run_std_server', + self._make_mock_run_std_server()): + # Condition variable that blocks the thread that represents the + # restarted chief. + cv = kwargs.get('cv', None) + # `before_restart` is True for the threads that represent the original + # chief and non-chief worker, and False for threads that represent the + # restarted chief and non-chief workers. + before_restart = kwargs['before_restart'] + if kwargs['new_chief']: + # `new_chief` is only True for the restarted chief thread. It waits + # until non-chief is preempted and restarted to simulate the causality + # where chief's restart results from non-chief's failure. + cv.acquire() + while not hasattr(cv, 'preempted'): + cv.wait() + cv.release() + + # Model building under strategy scope. Following is the code we expect + # the user runs on every worker. + strategy = get_strategy_object(strategy_cls) + batch_size = 64 + steps = 3 + train_ds, _ = _mnist_synthetic_dataset(batch_size, steps) + with strategy.scope(): + model = _get_model((28, 28, 1)) + + # Function to start a new thread. This will be called twice in the + # following code: one represents the restart of the non-chief, and one + # represents the restart of the chief as a result of the restart of the + # non-chief (so the training can continue in sync). + def start_new_thread(new_chief=False): + new_thread_tf_config = json.loads(os.environ['TF_CONFIG']) + new_thread_tf_config['cluster']['worker'] = kwargs['reserved_ports'] + return self._run_task_in_thread( + task_fn=_independent_worker_fn, + cluster_spec=None, + task_type=None, + task_id=None, + tf_config=new_thread_tf_config, + before_restart=False, + cv=cv, + new_chief=new_chief) + + if test_base.is_chief() and before_restart: + # Chief to start a new thread (that will be blocked by a condition + # variable until the non-chief's new thread is started). The thread + # for (recovered) chief is started before entering `fit()` because + # the original chief thread will eventually hang and be ignored. + start_new_thread(new_chief=True) + + try: + + class CkptSavedEpochAssertingCallback(callbacks.Callback): + + def __init__(self, test_obj): + super(CkptSavedEpochAssertingCallback, self).__init__() + self.test_obj = test_obj + + def on_epoch_begin(self, epoch, logs=None): + # `_ckpt_saved_epoch` attribute is set at the end of every epoch. 
+ self.test_obj.assertEqual(self.model._ckpt_saved_epoch is None, + epoch == 0) + + callbacks_list = [ + callbacks.ModelCheckpoint( + filepath=saving_filepath, + save_weights_only=True, + load_weights_on_restart=True), + CkptSavedEpochAssertingCallback(self) + ] + if before_restart: + callbacks_list.append(preemption_callback()) + + self.assertIsNone(model._ckpt_saved_epoch) + history = model.fit( + x=train_ds, + epochs=num_epoch, + steps_per_epoch=steps, + callbacks=callbacks_list) + self.assertIsNone(model._ckpt_saved_epoch) + + # `history` of the training result is collected to be compared against + # each other. It is expected that the training results (loss and + # accuracy`) are the same with or without preemption. + self._histories.append(history.history) + + except RuntimeError: + # pylint: disable=g-assert-in-except + self.assertTrue(before_restart) + # Reset the barrier so the new threads simulating recovery can + # continue. + self._barrier._counter = 0 + self._barrier._flag = False + + # Now that the non-chief has been preempted, it notifies the thread + # that simulates the restarted chief to start so they can be back in + # sync. + cv.acquire() + cv.preempted = True + cv.notify() + cv.release() + + # At this point we should discard the original non-chief thread, and + # start the new thread that simulates the restarted non-chief, hence + # joining the thread and return. + self.join_independent_workers([start_new_thread()]) + return + + # Successful end of a `fit()` call. + self._successful_thread_ends += 1 + self.assertFalse(before_restart) + + # Common parameters + num_workers = 2 + num_epoch = 3 + # History list storing the results for preemption and no preemption cases. + self._histories = [] + # Pass `saving_filepath` from the parent thread to ensure every worker has + # the same filepath to save. + saving_filepath = os.path.join(self.get_temp_dir(), + 'checkpoint.' + file_format) + strategy = get_strategy_object(strategy_cls) + + # Case 1: Training for `num_epoch` without preemptions. + cluster_spec = test_base.create_cluster_spec(num_workers=num_workers) + self._barrier = dc._Barrier(2) + self._successful_thread_ends = 0 + threads = self.run_multiple_tasks_in_threads( + _independent_worker_fn, + cluster_spec, + saving_filepath=saving_filepath, + before_restart=False, + new_chief=False) + if os.path.exists(saving_filepath): + os.remove(saving_filepath) + threads_to_join = [] + if strategy.extended.experimental_between_graph: + for ts in threads.values(): + threads_to_join.extend(ts) + else: + threads_to_join = [threads['worker'][0]] + self.join_independent_workers(threads_to_join) + self.assertEqual(self._successful_thread_ends, 2) + + # Case 2: Training for `num_epoch` epoch with preemptions. + # The preemption is simulated at both epoch boundary and batch boundary. + cluster_spec = test_base.create_cluster_spec(num_workers=num_workers) + cv = threading.Condition() + self._barrier = dc._Barrier(2) + # Ports reserved for new threads simulating recovery. 
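# Illustrative sketch (not part of the patch): the condition-variable
# handshake the test above relies on, reduced to plain `threading`. The
# "restarted chief" thread blocks until the non-chief marks itself as
# preempted, mirroring `cv.preempted` / `cv.notify()` in the test.
import threading

cv = threading.Condition()


def restarted_chief():
  with cv:
    while not getattr(cv, 'preempted', False):
      cv.wait()
  # ...rebuild the model and rejoin training here...


def non_chief():
  try:
    raise RuntimeError('Preemption!')  # Simulated preemption.
  except RuntimeError:
    with cv:
      cv.preempted = True
      cv.notify()


chief_thread = threading.Thread(target=restarted_chief)
chief_thread.start()
non_chief()
chief_thread.join()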
+ reserved_ports = [ + 'localhost:%s' % test_base.pick_unused_port() + for _ in range(num_workers) + ] + self._successful_thread_ends = 0 + threads = self.run_multiple_tasks_in_threads( + _independent_worker_fn, + cluster_spec, + saving_filepath=saving_filepath, + reserved_ports=reserved_ports, + before_restart=True, + cv=cv, + new_chief=False) + if os.path.exists(saving_filepath): + os.remove(saving_filepath) + threads_to_join = [] + if strategy.extended.experimental_between_graph: + # Only join the non-chief thread since the first thread for chief will + # eventually hang and be ignored. + threads_to_join = [threads['worker'][1]] + else: + threads_to_join = [threads['worker'][0]] + self.join_independent_workers(threads_to_join) + self.assertEqual(self._successful_thread_ends, 2) + + def assert_all_elements_are_identical(list_to_check): + first_item = list_to_check[0] + for item in list_to_check[1:]: + self.assertAllClose(first_item, item, rtol=1e-5, atol=1e-5) + + # Important: the results from preemption interrupted and non-interrupted + # cases should give the same final results. + assert_all_elements_are_identical( + [history['acc'][-1] for history in self._histories]) + assert_all_elements_are_identical( + [history['loss'][-1] for history in self._histories]) + # The length of `self._histories` would be num_workers * num_runs (3). + self.assertLen(self._histories, 4) + # The actual testing methods go here. test_chief_only_callback = generate_callback_test_function( callableForTestChiefOnlyCallback.__func__) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index b98f9344b4f..dffbf6a1314 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -157,8 +157,8 @@ class Layer(module.Module): # Mutable properties # Indicates whether the layer's weights are updated during training - # and whether the layer's updates are run during training - self.trainable = trainable + # and whether the layer's updates are run during training. + self._trainable = trainable # A stateful layer is a layer whose updates are run during inference too, # for instance stateful RNNs. self.stateful = False @@ -196,10 +196,6 @@ class Layer(module.Module): self._metrics_tensors = {} self._set_dtype_and_policy(dtype) - - self._call_fn_args = function_utils.fn_args(self.call) - self._compute_previous_mask = ('mask' in self._call_fn_args or - hasattr(self, 'compute_mask')) self._call_convention = (base_layer_utils .CallConvention.EXPLICIT_INPUTS_ARGUMENT) # Dependencies tracked via attribute assignment. @@ -565,11 +561,8 @@ class Layer(module.Module): # Handle Keras mask propagation from previous layer to current layer. previous_mask = None - if (not hasattr(self, '_compute_previous_mask') or - self._compute_previous_mask): + if self._should_compute_mask: previous_mask = base_layer_utils.collect_previous_mask(inputs) - if not hasattr(self, '_call_fn_args'): - self._call_fn_args = function_utils.fn_args(self.call) if ('mask' in self._call_fn_args and 'mask' not in kwargs and not generic_utils.is_all_none(previous_mask)): # The previous layer generated a mask, and mask was not explicitly @@ -579,7 +572,7 @@ class Layer(module.Module): # Clear eager losses on top level model call. # We are clearing the losses only on the top level model call and not on # every layer/mode call because layer/model may be reused. 
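# Illustrative sketch (not part of the patch): the effect of the recursive
# `trainable` setter added to base_layer.py in this diff, which propagates
# the flag to every nested layer tracked in `_layers`.
import tensorflow as tf

inner = tf.keras.Sequential(
    [tf.keras.layers.Dense(4, input_shape=(8,)),
     tf.keras.layers.Dense(2)])
outer = tf.keras.Sequential([inner, tf.keras.layers.Dense(1)])

outer.trainable = False
# Every nested layer is frozen as well, not just the top-level model.
assert all(not layer.trainable for layer in inner.layers)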
- if (context.executing_eagerly() and + if (base_layer_utils.is_in_eager_or_tf_function() and not base_layer_utils.is_in_call_context()): self._clear_losses() @@ -627,7 +620,10 @@ class Layer(module.Module): with base_layer_utils.autocast_context_manager( input_list, self._mixed_precision_policy.should_cast_variables): - if ops.executing_eagerly_outside_functions(): + # Add auto_control_deps in V2 when they are not already added by + # a `tf.function`. + if (ops.executing_eagerly_outside_functions() and + not base_layer_utils.is_in_eager_or_tf_function()): with auto_control_deps.AutomaticControlDependencies() as acd: outputs = call_fn(inputs, *args, **kwargs) # Wrap Tensors in `outputs` in `tf.identity` to avoid @@ -705,6 +701,16 @@ class Layer(module.Module): def dynamic(self): return self._dynamic + @property + def trainable(self): + return self._trainable + + @trainable.setter + def trainable(self, value): + self._trainable = value + for layer in getattr(self, '_layers', []): + layer.trainable = value + @property def activity_regularizer(self): """Optional regularizer function for the output of this layer.""" @@ -828,7 +834,7 @@ class Layer(module.Module): model = tf.keras.Model(inputs, outputs) # Actvity regularization. model.add_loss(tf.abs(tf.reduce_mean(x))) - ```` + ``` If this is not the case for your loss (if, for example, your loss references a `Variable` of one of the model's layers), you can wrap your loss in a @@ -884,7 +890,9 @@ class Layer(module.Module): continue if not tensor_util.is_tensor(loss): loss = ops.convert_to_tensor(loss, dtype=backend.floatx()) - if tf_utils.is_symbolic_tensor(loss): + # TF Functions should take the eager path. + if (tf_utils.is_symbolic_tensor(loss) and + not base_layer_utils.is_in_tf_function()): symbolic_losses.append(_tag_unconditional(loss)) elif tensor_util.is_tensor(loss): eager_losses.append(_tag_unconditional(loss)) @@ -952,10 +960,11 @@ class Layer(module.Module): 'We currently support only `mean` sample-wise metric aggregation. ' 'You provided aggregation=`%s`' % aggregation) + from_metric_obj = hasattr(value, '_metric_obj') is_symbolic = tf_utils.is_symbolic_tensor(value) call_context = base_layer_utils.is_in_call_context() - if name is None and (not is_symbolic or not hasattr(value, '_metric_obj')): + if name is None and not from_metric_obj: # Eg. `self.add_metric(math_ops.reduce_sum(x), aggregation='mean')` # In eager mode, we use metric name to lookup a metric. Without a name, # a new Mean metric wrapper will be created on every model/layer call. @@ -972,9 +981,9 @@ class Layer(module.Module): 'name=\'mean_activation\', aggregation=\'mean\')`') if call_context: - if is_symbolic: - with backend.get_graph().as_default(): - self._symbolic_add_metric(value, aggregation, name) + # TF Function path should take the eager path. + if is_symbolic and not base_layer_utils.is_in_tf_function(): + self._symbolic_add_metric(value, aggregation, name) else: self._eager_add_metric(value, aggregation, name) else: @@ -988,7 +997,7 @@ class Layer(module.Module): self._symbolic_add_metric(value, aggregation, name) return - if getattr(value, '_metric_obj', None): + if from_metric_obj: raise ValueError('Using the result of calling a `Metric` object ' 'when calling `add_metric` on a Functional ' 'Model is not supported. 
Please pass the ' @@ -2034,6 +2043,17 @@ class Layer(module.Module): def _is_layer(self): return True + @property + def _call_fn_args(self): + if getattr(self, '__call_fn_args', None) is None: + self.__call_fn_args = function_utils.fn_args(self.call) + return self.__call_fn_args + + @property + def _should_compute_mask(self): + return ('mask' in self._call_fn_args or + getattr(self, 'compute_mask', None) is not None) + class Node(object): """A `Node` describes the connectivity between two layers. diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py index b8a3f9a5eed..8083dc53b5f 100644 --- a/tensorflow/python/keras/engine/base_layer_test.py +++ b/tensorflow/python/keras/engine/base_layer_test.py @@ -20,6 +20,7 @@ from __future__ import print_function import collections import itertools as it +import os import sys import traceback from absl.testing import parameterized @@ -41,9 +42,12 @@ from tensorflow.python.layers import core as legacy_core from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops +from tensorflow.python.ops import summary_ops_v2 from tensorflow.python.ops import tensor_array_ops from tensorflow.python.ops import variables +from tensorflow.python.platform import gfile from tensorflow.python.platform import test +from tensorflow.python.summary import summary_iterator class DynamicLayer(base_layer.Layer): @@ -501,6 +505,37 @@ class SymbolicSupportTest(test.TestCase): function_name = last_entry[2] self.assertEqual(function_name, 'easily_identifiable_name') + @test_util.run_in_graph_and_eager_modes + def test_summaries_in_tf_function(self): + if not context.executing_eagerly(): + return + + class MyLayer(keras.layers.Layer): + + def call(self, inputs): + summary_ops_v2.scalar('mean', math_ops.reduce_mean(inputs)) + return inputs + + tmp_dir = self.get_temp_dir() + writer = summary_ops_v2.create_file_writer_v2(tmp_dir) + with writer.as_default(), summary_ops_v2.always_record_summaries(): + my_layer = MyLayer() + x = array_ops.ones((10, 10)) + + def my_fn(x): + return my_layer(x) + + _ = my_fn(x) + + event_file = gfile.Glob(os.path.join(tmp_dir, 'events*')) + self.assertLen(event_file, 1) + event_file = event_file[0] + tags = set() + for e in summary_iterator.summary_iterator(event_file): + for val in e.summary.value: + tags.add(val.tag) + self.assertEqual(set(['my_layer/mean']), tags) + @test_util.run_all_in_graph_and_eager_modes class NestedTrackingTest(test.TestCase): diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py index c9a27f7e82b..b419ca7341e 100644 --- a/tensorflow/python/keras/engine/base_layer_utils.py +++ b/tensorflow/python/keras/engine/base_layer_utils.py @@ -353,15 +353,21 @@ def is_in_keras_graph(): """Returns if currently executing inside of a Keras graph.""" # Returns True even if in a subgraph of the Keras graph, such as those # created by control flow ops. 
+ if context.executing_eagerly(): + return False return (getattr(backend.get_graph(), 'name', None) == 'keras_graph' or getattr(_call_context, 'in_keras_graph', False)) def is_in_eager_or_tf_function(): """Returns if in eager mode or inside of a tf.function.""" - return (context.executing_eagerly() or - (ops.executing_eagerly_outside_functions() and - not is_in_keras_graph())) + return context.executing_eagerly() or is_in_tf_function() + + +def is_in_tf_function(): + """Returns if inside of a tf.function.""" + return (ops.executing_eagerly_outside_functions() and + not context.executing_eagerly() and not is_in_keras_graph()) def uses_keras_history(tensors): @@ -446,7 +452,7 @@ def training_arg_passed_to_call(argspec, args, kwargs): # `argspec.args` starts with ['self', 'inputs'] full_args = dict(zip(argspec.args[2:], args)) full_args.update(kwargs) - return 'training' in full_args + return 'training' in full_args and full_args['training'] is not None def _get_var_read_dtype(input_list, should_cast): @@ -490,7 +496,8 @@ def check_graph_consistency(tensor=None, method='add_loss', force_raise=False): We need to raise clear error messages in such cases. Arguments: - tensor: Tensor to check. + tensor: Tensor to check, or `False` if it is known that an error + should be raised. method: Caller method, one of {'add_metric', 'add_loss', 'add_update'}. force_raise: If an error should be raised regardless of `tensor`. @@ -586,6 +593,12 @@ def mark_as_return(outputs, acd): return_tensor._keras_mask = acd.mark_as_return(tensor._keras_mask) else: return_tensor._keras_mask = None + + # Handle TensorFlow Probability attached metadata. + # TODO(b/132076537): Remove this once TFP uses `CompositeTensor`. + if getattr(tensor, '_tfp_distribution', None) is not None: + return_tensor._tfp_distribution = tensor._tfp_distribution + return return_tensor # pylint: enable=protected-access diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index a85c92ad1a7..1b9ac7d824f 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -35,6 +35,7 @@ from tensorflow.python.framework import func_graph from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.keras import backend +from tensorflow.python.keras import callbacks from tensorflow.python.keras import saving from tensorflow.python.keras.engine import base_layer from tensorflow.python.keras.engine import base_layer_utils @@ -191,9 +192,7 @@ class Network(base_layer.Layer): self._init_set_name(name, zero_based=True) self._activity_regularizer = None # This acts just like the `trainable` attribute of any layer instance. - # It does not affect users of the underlying layers, only users of the - # Network instance. - self.trainable = kwargs.get('trainable', True) + self._trainable = kwargs.get('trainable', True) # This attribute has no effect if the model is created using the Functional # API. Instead, `model.dynamic` is determined based on the internal layers. self._dynamic = kwargs.get('dynamic', False) @@ -271,9 +270,6 @@ class Network(base_layer.Layer): self._base_init(name=name, **kwargs) self._validate_graph_inputs_and_outputs() - self._compute_previous_mask = ( - 'mask' in tf_inspect.getfullargspec(self.call).args or - hasattr(self, 'compute_mask')) # A Network does not create weights of its own, thus it is already # built. 
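# Illustrative sketch (not part of the patch): the execution states that
# `is_in_tf_function()` and `is_in_eager_or_tf_function()` above distinguish.
# Assumes eager execution is enabled at the top level (TF 2.x behavior).
import tensorflow as tf
from tensorflow.python.framework import ops


@tf.function
def traced(x):
  # While tracing a tf.function body, eager execution is off, but we are
  # still "eagerly outside functions", which is what the helpers key off.
  assert not tf.executing_eagerly()
  assert ops.executing_eagerly_outside_functions()
  return x + 1


traced(tf.constant(1))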
self.built = True @@ -516,6 +512,10 @@ class Network(base_layer.Layer): weights += (self._trainable_weights + self._non_trainable_weights) return weights + @property + def _should_compute_mask(self): + return self._is_graph_network and super(Network, self)._should_compute_mask + def compute_mask(self, inputs, mask): if not self._is_graph_network: return None @@ -1306,6 +1306,11 @@ class Network(base_layer.Layer): if save_format == 'h5': with h5py.File(filepath, 'w') as f: saving.save_weights_to_hdf5_group(f, self.layers) + # TODO(rchao): Save this attribute in a decoupled checkpoint file + # that is solely for the purpose of fault tolerance. + if self._ckpt_saved_epoch is not None: + f.attrs[callbacks.CKPT_SAVED_EPOCH] = str( + self._ckpt_saved_epoch).encode('utf8') else: if context.executing_eagerly(): session = None @@ -1405,6 +1410,12 @@ class Network(base_layer.Layer): with h5py.File(filepath, 'r') as f: if 'layer_names' not in f.attrs and 'model_weights' in f: f = f['model_weights'] + # TODO(rchao): Load this attribute from a decoupled metadata+checkpoint + # file that is solely for the purpose of fault tolerance. Decide if we + # should use TF or HDF5 format for the metadata. + if callbacks.CKPT_SAVED_EPOCH in f.attrs: + self._ckpt_saved_epoch = f.attrs[callbacks.CKPT_SAVED_EPOCH].decode( + 'utf8') if by_name: saving.load_weights_from_hdf5_group_by_name(f, self.layers) else: diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 2ae0592102a..78a9102d73e 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -135,6 +135,10 @@ class Model(network.Network): self._run_eagerly = None + # The epoch at which the checkpoint is saved. Used for fault-tolerance. + # See `_maybe_load_initial_epoch_from_ckpt()` for more information. + self._ckpt_saved_epoch = None + def get_weights(self): """Retrieves the weights of the model. @@ -220,10 +224,7 @@ class Model(network.Network): `optimizer`, `loss`, `metrics` or `sample_weight_mode`. """ _keras_api_gauge.get_cell('compile').set(True) - run_eagerly = kwargs.pop('run_eagerly', None) - - self._run_eagerly = run_eagerly - optimizer = optimizers.get(optimizer) + self._run_eagerly = kwargs.pop('run_eagerly', None) if distribute is not None: if tf2.enabled(): @@ -248,13 +249,13 @@ class Model(network.Network): # cloning is requested. # TODO(b/124517980, b/124377929): Remove this temporary undocumented way # of enabling the feature and graduate it to the main distributed code path. - self._cloning = kwargs.pop('cloning', True) + self._cloning = kwargs.pop('cloning', False) - self._validate_compile_param_for_distribution_strategy(run_eagerly, + self._validate_compile_param_for_distribution_strategy(self.run_eagerly, sample_weight_mode, target_tensors, weighted_metrics) - self.optimizer = optimizer + self.optimizer = optimizers.get(optimizer) # We've disabled automatic dependency tracking for this method, but do want # to add a checkpoint dependency on the optimizer if it's trackable. if isinstance(self.optimizer, trackable.Trackable): @@ -269,8 +270,10 @@ class Model(network.Network): raise ValueError( 'target_tensors argument is not supported when ' 'running a model eagerly.') - self.target_tensors = target_tensors - self.targets = [] + + # _training_targets contains a list of _TrainingTarget object, which has all + # feedable and non feedable targets of the model and related metadata. 
+ self._training_targets = [] # Set tf.distribute.Strategy specific parameters. self._distributed_model_cache = {} @@ -294,11 +297,6 @@ class Model(network.Network): self.loss_functions = training_utils.prepare_loss_functions( self.loss, self.output_names) - self._feed_output_names = [] - self._feed_output_shapes = [] - self._feed_loss_fns = [] - self._feed_targets = [] - skip_target_indices = self._prepare_skip_target_indices() self._skip_target_weighing_indices = skip_target_indices[:] @@ -308,9 +306,8 @@ class Model(network.Network): # Initialization for Eager mode execution. if self.run_eagerly: - self._compile_eagerly(metrics, optimizer, sample_weight_mode, - skip_target_indices, target_tensors, - weighted_metrics) + self._compile_eagerly(metrics, sample_weight_mode, skip_target_indices, + target_tensors, weighted_metrics) return with K.get_graph().as_default(): @@ -319,32 +316,34 @@ class Model(network.Network): for i in range(len(self.outputs)): if i in skip_target_indices: - self.targets.append(None) + self._training_targets.append(_TrainingTarget(None)) else: - shape = K.int_shape(self.outputs[i]) + target = target_tensors[i] name = self.output_names[i] - if target_tensors not in (None, []): - target = target_tensors[i] - else: - target = None - if target is None or K.is_placeholder(target): - if target is None: - target_dtype = losses.LABEL_DTYPES_FOR_LOSSES.get( - self.loss_functions[i], - K.dtype(self.outputs[i])) + shape = K.int_shape(self.outputs[i]) + loss_fn = self.loss_functions[i] - target = K.placeholder( - ndim=len(shape), - name=name + '_target', - sparse=K.is_sparse(self.outputs[i]), - dtype=target_dtype) - self._feed_targets.append(target) - self._feed_output_names.append(name) - self._feed_output_shapes.append(shape) - self._feed_loss_fns.append(self.loss_functions[i]) - else: + if target is not None and not K.is_placeholder(target): self._skip_target_weighing_indices.append(i) - self.targets.append(target) + feedable = False + else: + feedable = True + + if target is None: + target_dtype = losses.LABEL_DTYPES_FOR_LOSSES.get( + loss_fn, + K.dtype(self.outputs[i])) + + target = K.placeholder( + ndim=len(shape), + name=name + '_target', + sparse=K.is_sparse(self.outputs[i]), + dtype=target_dtype) + + training_target = _TrainingTarget( + target, name=name, shape=shape, feedable=feedable, + loss_fn=loss_fn) + self._training_targets.append(training_target) # Save all metric attributes per output of the model. self._cache_output_metric_attributes(metrics, weighted_metrics) @@ -356,7 +355,7 @@ class Model(network.Network): self._handle_metrics( self.outputs, masks=self._prepare_output_masks(), - targets=self.targets, + targets=self._targets, skip_target_indices=skip_target_indices) # Prepare sample weight modes. List with the same length as model outputs. @@ -377,8 +376,7 @@ class Model(network.Network): self.predict_function = None # Collected trainable weights, sorted in topological order. - trainable_weights = self.trainable_weights - self._collected_trainable_weights = trainable_weights + self._collected_trainable_weights = self.trainable_weights # Validate all variables were correctly created in distribution scope. 
if self._distribution_strategy and not self._compile_distribution: @@ -1615,12 +1613,14 @@ class Model(network.Network): 'The model has %s outputs, but you passed target_tensors=%s' % (len(self.outputs), target_tensors)) elif isinstance(target_tensors, dict): - for name in target_tensors: - if name not in self.output_names: - raise ValueError( - 'Unknown entry in `target_tensors` dictionary: "{name}". ' - 'Only expected the following keys: {keys}'.format( - name=name, keys=str(self.output_names))) + unexpected_target_tensor_names = set(target_tensors.keys()).difference( + self.output_names) + if unexpected_target_tensor_names: + raise ValueError( + 'Unknown entry in `target_tensors` dictionary: "{name}". ' + 'Only expected the following keys: {keys}'.format( + name=unexpected_target_tensor_names, + keys=str(self.output_names))) tmp_target_tensors = [] for name in self.output_names: tmp_target_tensors.append(target_tensors.get(name, None)) @@ -1630,11 +1630,16 @@ class Model(network.Network): else: raise TypeError('Expected `target_tensors` to be a list or tuple or ' 'dict or a single tensor, but got:', target_tensors) + else: + # In case target tensor is empty or None, create a list with Nones + # that has same length as self.output_names. With that, the None check of + # target tensor can be skipped downstream. + target_tensors = [None for _ in self.output_names] return target_tensors - def _compile_eagerly(self, metrics, optimizer, sample_weight_mode, + def _compile_eagerly(self, metrics, sample_weight_mode, skip_target_indices, target_tensors, weighted_metrics): - if isinstance(optimizer, loss_scale_optimizer.LossScaleOptimizer): + if isinstance(self.optimizer, loss_scale_optimizer.LossScaleOptimizer): # TODO(reedwm): Support this. raise ValueError('We currently do not support enabling `run_eagerly` ' 'with a LossScaleOptimizer.') @@ -1653,7 +1658,8 @@ class Model(network.Network): # Set metric attributes on model. self._set_metric_attributes(skip_target_indices=skip_target_indices) for i in range(len(self.outputs)): - self._feed_output_names.append(self.output_names[i]) + self._training_targets.append( + _TrainingTarget(None, self.output_names[i], None, True, None)) self._collected_trainable_weights = self.trainable_weights def _update_sample_weight_modes(self, sample_weights=None): @@ -1719,7 +1725,7 @@ class Model(network.Network): self._handle_metrics( self.outputs, masks=masks, - targets=self.targets, + targets=self._targets, skip_target_indices=skip_target_indices, sample_weights=self.sample_weights, return_weighted_metrics=True) @@ -1768,7 +1774,7 @@ class Model(network.Network): skip_target_indices = skip_target_indices or [] total_loss = None with K.name_scope('loss'): - zipped_inputs = zip(self.targets, self.outputs, self.loss_functions, + zipped_inputs = zip(self._targets, self.outputs, self.loss_functions, self.sample_weights, masks, self.loss_weights_list) for i, (y_true, y_pred, loss_fn, sample_weight, mask, loss_weight) in enumerate(zipped_inputs): @@ -1788,7 +1794,6 @@ class Model(network.Network): mask, None, sample_weight)) sample_weight *= mask - weighted_losses = None if hasattr(loss_fn, 'reduction'): per_sample_losses = loss_fn.call(y_true, y_pred) weighted_losses = losses_utils.compute_weighted_loss( @@ -1819,19 +1824,13 @@ class Model(network.Network): output_loss = losses_utils.scale_loss_for_distribution(output_loss) if len(self.outputs) > 1: - # Keep track of stateful result tensor and function for the loss. - # Compute the stateful loss value. 
- if weighted_losses is not None: - # TODO(b/120571621): Directly call metric when the bug is fixed. - aggregated_output_loss = ( - distributed_training_utils.call_replica_local_fn( - self._output_loss_metrics[i], - weighted_losses, - strategy=self._distribution_strategy)) - else: - # Custom loss class. - aggregated_output_loss = self._call_metric_fn( - self._output_loss_metrics[i], y_true, y_pred, sample_weight) + # Keep track of stateful result tensor for the loss. + # TODO(b/120571621): Directly call metric when the bug is fixed. + aggregated_output_loss = ( + distributed_training_utils.call_replica_local_fn( + self._output_loss_metrics[i], + output_loss, + strategy=self._distribution_strategy)) self._compile_metrics_tensors[loss_name] = aggregated_output_loss if total_loss is None: @@ -2111,12 +2110,12 @@ class Model(network.Network): self._set_per_output_metric_attributes( self._per_output_weighted_metrics[i], i)) - # Create a metric wrapper for each output loss. + # Create a metric wrapper for each output loss. This computes mean of an + # output loss across mini-batches (irrespective of how we reduce within a + # batch). if len(self.outputs) > 1: self._output_loss_metrics = [ - metrics_module.SumOverBatchSize() if hasattr(loss_fn, 'reduction') - else metrics_module.SumOverBatchSizeMetricWrapper(loss_fn) - for loss_fn in self.loss_functions + metrics_module.Mean() for _ in self.loss_functions ] self._per_output_metrics = updated_per_output_metrics @@ -2894,6 +2893,53 @@ class Model(network.Network): self.output_names = training_utils.generic_output_names(outputs) self.built = True + @property + def _targets(self): + """The output target tensors for the model.""" + return [t.target for t in self._training_targets] + + @property + def _feed_targets(self): + return [t.target for t in self._training_targets if t.feedable] + + @property + def _feed_output_names(self): + return [t.name for t in self._training_targets if t.feedable] + + @property + def _feed_output_shapes(self): + return [t.shape for t in self._training_targets if t.feedable] + + @property + def _feed_loss_fns(self): + return [t.loss_fn for t in self._training_targets if t.feedable] + + def _maybe_load_initial_epoch_from_ckpt(self, initial_epoch, mode): + """Maybe load initial epoch from ckpt considering possible worker recovery. + + When `_ckpt_saved_epoch` attribute is not None in a `Model` object at the + time the training starts, this is under multi-worker training setting and + indicates the worker is recovering from previous failure. In this case, + infer `initial_epoch` from `self._ckpt_saved_epoch` to continue previous + unfinished training from certain epoch. + + Arguments: + initial_epoch: The original initial_epoch user passes in in `fit()`. + mode: The training mode. + + Returns: + If the training is recovering from previous failure under multi-worker + training setting, return the epoch the training is supposed to continue + at. Otherwise, return the `initial_epoch` the user passes in. + """ + # TODO(rchao): Add recovery for validation case + # (when mode == ModeKeys.TEST). + if mode == ModeKeys.TRAIN and self._ckpt_saved_epoch is not None: + # The most recently saved epoch is one epoch prior to the epoch it failed + # at, so return '_ckpt_saved_epoch' plus one. 
+ return int(self._ckpt_saved_epoch) + 1 + return initial_epoch + class DistributedCallbackModel(Model): """Model that is used for callbacks with tf.distribute.Strategy.""" @@ -2935,5 +2981,54 @@ class DistributedCallbackModel(Model): return super(DistributedCallbackModel, self).__getattr__(item) +class _TrainingTarget(object): + """Container for a target tensor and its metadata (shape, loss...). + + Arguments: + target: A target tensor for the model. It may be `None` if the + output is excluded from loss computation. It is still kept as None + since each output of the model should have a corresponding target. If + the target is None, the rest of the attributes will be None as well. + name: String, the name of the target tensor. + shape: The shape of the target tensor. + feedable: Boolean, whether the target is feedable (requires data to be + passed in `fit` or `train_on_batch`), or not (model compiled with + `target_tensors` argument). + loss_fn: The loss function corresponding to this target. May be `None`. + """ + + def __init__(self, + target, + name=None, + shape=None, + feedable=False, + loss_fn=None): + self._target = target + self._name = name + self._shape = shape + self._feedable = feedable + self._loss_fn = loss_fn + + @property + def target(self): + return self._target + + @property + def name(self): + return self._name + + @property + def shape(self): + return self._shape + + @property + def feedable(self): + return self._feedable + + @property + def loss_fn(self): + return self._loss_fn + + def _is_symbolic_tensor(x): return tensor_util.is_tensor(x) and not isinstance(x, ops.EagerTensor) diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py index e3960a320af..ab890a10dad 100644 --- a/tensorflow/python/keras/engine/training_arrays.py +++ b/tensorflow/python/keras/engine/training_arrays.py @@ -235,6 +235,8 @@ def model_iteration(model, callbacks._call_begin_hook(mode) progbar.on_train_begin() + initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode) + for epoch in range(initial_epoch, epochs): if callbacks.model.stop_training: break diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py index c38e9a66513..b12596992db 100644 --- a/tensorflow/python/keras/engine/training_distributed.py +++ b/tensorflow/python/keras/engine/training_distributed.py @@ -393,6 +393,9 @@ def experimental_tpu_fit_loop(model, target_steps = len(steps_to_run) callbacks._call_begin_hook(mode) + + initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode) + for epoch in range(initial_epoch, epochs): distributed_training_utils._reset_metrics(model) callbacks.on_epoch_begin(epoch) diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py index d45710f4bed..b2fbdffc029 100644 --- a/tensorflow/python/keras/engine/training_eager.py +++ b/tensorflow/python/keras/engine/training_eager.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.keras import backend from tensorflow.python.keras.engine import training_utils +from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer from tensorflow.python.keras.utils import losses_utils from tensorflow.python.ops import math_ops from tensorflow.python.platform import tf_logging as logging @@ -179,15 +180,8 @@ def _model_loss(model, # 
associated with a model, each output's loss is calculated and returned # as part of the loss_metrics. if len(model.outputs) > 1: - # Compute the stateful loss value. - if weighted_losses is not None: - aggregated_output_loss = output_loss_metrics[i](weighted_losses) - else: - # Custom loss class. - aggregated_output_loss = training_utils.call_metric_function( - output_loss_metrics[i], targets[i], outs[i], weights=weights) # Keep track of the stateful output loss result. - output_losses.append(aggregated_output_loss) + output_losses.append(output_loss_metrics[i](output_loss)) total_loss += model.loss_weights_list[i] * output_loss @@ -241,13 +235,24 @@ def _process_single_batch(model, if total_loss is None: raise ValueError('The model cannot be run ' 'because it has no loss to optimize.') + if isinstance(model.optimizer, loss_scale_optimizer.LossScaleOptimizer): + # TODO(reedwm): Make loss_scale public instead of accessing private + # _loss_scale attribute. + loss_scale = model.optimizer._loss_scale() + scaled_total_loss = loss_scale_optimizer.scale_loss(total_loss, + loss_scale) + else: + loss_scale = None + scaled_total_loss = total_loss if training: if not model.trainable_weights: logging.warning('The list of trainable weights is empty. Make sure that' ' you are not setting model.trainable to False before ' 'compiling the model.') else: - grads = tape.gradient(total_loss, model.trainable_weights) + grads = tape.gradient(scaled_total_loss, model.trainable_weights) + if loss_scale is not None: + grads = loss_scale_optimizer.unscale_grads(grads, loss_scale) model.optimizer.apply_gradients(zip(grads, model.trainable_weights)) return outs, total_loss, output_losses, masks @@ -273,12 +278,13 @@ def train_on_batch(model, """ if isinstance(inputs, collections.Sequence): if len(inputs) and tensor_util.is_tensor(inputs[0]): - inputs = training_utils.cast_if_floating_dtype(inputs) + inputs = training_utils.cast_if_floating_to_model_input_dtypes(inputs, + model) if targets: targets = training_utils.cast_if_floating_dtype(targets) else: - inputs = training_utils.cast_if_floating_dtype( - [ops.convert_to_tensor(val) for val in inputs]) + inputs = training_utils.cast_if_floating_to_model_input_dtypes( + [ops.convert_to_tensor(val) for val in inputs], model) if targets: targets = training_utils.cast_if_floating_dtype( [ops.convert_to_tensor(val) for val in targets]) @@ -331,11 +337,12 @@ def test_on_batch(model, """ if isinstance(inputs, collections.Sequence): if len(inputs) and tensor_util.is_tensor(inputs[0]): - inputs = training_utils.cast_if_floating_dtype(inputs) + inputs = training_utils.cast_if_floating_to_model_input_dtypes(inputs, + model) targets = training_utils.cast_if_floating_dtype(targets) else: - inputs = training_utils.cast_if_floating_dtype( - [ops.convert_to_tensor(val) for val in inputs]) + inputs = training_utils.cast_if_floating_to_model_input_dtypes( + [ops.convert_to_tensor(val) for val in inputs], model) targets = training_utils.cast_if_floating_dtype( [ops.convert_to_tensor(val) for val in targets]) if sample_weights: diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py index 3c91790e686..9e9c60a35ed 100644 --- a/tensorflow/python/keras/engine/training_generator.py +++ b/tensorflow/python/keras/engine/training_generator.py @@ -190,6 +190,9 @@ def model_iteration(model, callbacks.model.stop_training = False callbacks._call_begin_hook(mode) progbar.on_train_begin() + + initial_epoch = 
model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode) + for epoch in range(initial_epoch, epochs): if callbacks.model.stop_training: break diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index 732270c7c4d..173d1949db4 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -1047,15 +1047,17 @@ class TrainingTest(keras_parameterized.TestCase): model.add_loss(2 * math_ops.reduce_mean( keras.losses.mean_absolute_error(targets, outputs))) + model.add_loss(keras.losses.MeanAbsoluteError()(targets, outputs)) + model.compile( - keras.optimizer_v2.gradient_descent.SGD(0.033333), + keras.optimizer_v2.gradient_descent.SGD(0.025), loss=keras.losses.MeanAbsoluteError(), run_eagerly=testing_utils.should_run_eagerly()) x = np.array([[0.], [1.], [2.]]) y = np.array([[0.5], [2.], [3.5]]) history = model.fit([x, y], y, batch_size=3, epochs=5) - self.assertAllClose(history.history['loss'], [3., 2.7, 2.4, 2.1, 1.8], 1e-3) + self.assertAllClose(history.history['loss'], [4., 3.6, 3.2, 2.8, 2.4], 1e-3) @keras_parameterized.run_all_keras_modes def test_unconditional_add_loss_correctness(self): @@ -1128,6 +1130,72 @@ class TrainingTest(keras_parameterized.TestCase): self.assertLen(model.trainable_variables, 3) + # TODO(b/131372221): Make this work with subclassed models. + @keras_parameterized.run_with_all_model_types(exclude_models=['subclass']) + @keras_parameterized.run_all_keras_modes + def test_model_dtype(self): + + class AssertTypeLayer(keras.layers.Layer): + + def __init__(self, assert_type=None, **kwargs): + super(AssertTypeLayer, self).__init__(**kwargs) + self.assert_type = assert_type + + def call(self, inputs): + assert inputs.dtype.name == self.assert_type, ( + 'Input tensor has type %s which does not match assert type %s' % + (inputs.dtype.name, self.assert_type)) + return inputs + 1. 
+ + for dtype in ('float16', 'float32', 'float64'): + model = testing_utils.get_model_from_layers([AssertTypeLayer(dtype)], + input_shape=(10,), + input_dtype=dtype) + model.compile('sgd', 'mse', + run_eagerly=testing_utils.should_run_eagerly()) + + x = np.ones((10, 10), dtype=dtype) + y = np.ones((10, 10), dtype=dtype) + model.fit(x, y) + model.test_on_batch(x, y) + model(x) + + @keras_parameterized.run_all_keras_modes(always_skip_v1=True) + def test_subclassed_model_with_training_arg(self): + + class LayerWithTrainingArg(keras.layers.Layer): + + def call(self, inputs, training=None): + self.training = training + return inputs + + class ModelWithTrainingArg(keras.Model): + + def __init__(self): + super(ModelWithTrainingArg, self).__init__() + self.l1 = LayerWithTrainingArg() + + def call(self, inputs, training=None): + self.training = training + inputs = self.l1(inputs, training=training) + return inputs + + x = np.zeros((1, 2)) + model = ModelWithTrainingArg() + model.compile( + loss='mse', + optimizer='sgd', + run_eagerly=testing_utils.should_run_eagerly()) + model.fit(x, x, epochs=1) + + if testing_utils.should_run_eagerly(): + expected_training_arg = True + else: + expected_training_arg = keras.backend.symbolic_learning_phase() + + self.assertEqual(model.training, expected_training_arg) + self.assertEqual(model.l1.training, expected_training_arg) + class TestExceptionsAndWarnings(keras_parameterized.TestCase): @@ -2578,6 +2646,14 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): model = keras.models.Model(x, y) model.add_metric( math_ops.reduce_sum(y), name='metric_1', aggregation='mean') + + if context.executing_eagerly(): + # This is not a use case in v1 graph mode. + mean_result = metrics_module.Mean()(y) + with self.assertRaisesRegex( + ValueError, 'Expected a symbolic Tensor for the metric value'): + model.add_metric(mean_result, name='metric_2') + with self.assertRaisesRegex( ValueError, 'Using the result of calling a `Metric` object '): with keras.backend.get_graph().as_default(): @@ -2706,6 +2782,13 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): model.add_metric( math_ops.reduce_sum(y), name='metric_3', aggregation='mean') + if context.executing_eagerly(): + # This is not a use case in v1 graph mode. + mean_result = metrics_module.Mean()(y) + with self.assertRaisesRegex( + ValueError, 'Expected a symbolic Tensor for the metric value'): + model.add_metric(mean_result, name='metric_4') + with self.assertRaisesRegex( ValueError, 'Using the result of calling a `Metric` object '): with keras.backend.get_graph().as_default(): diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py index 7929e2b40a1..bd17ad04490 100644 --- a/tensorflow/python/keras/engine/training_utils.py +++ b/tensorflow/python/keras/engine/training_utils.py @@ -993,10 +993,11 @@ def check_steps_argument(input_data, steps, steps_name): return False -def cast_single_tensor(x): +def cast_single_tensor(x, dtype=None): x = ops.convert_to_tensor(x) + dtype = dtype or K.floatx() if x.dtype.is_floating: - return math_ops.cast(x, dtype=K.floatx()) + return math_ops.cast(x, dtype=dtype) return x @@ -1013,6 +1014,25 @@ def cast_if_floating_dtype(x): return nest.map_structure(cast_single_tensor, x) +def cast_if_floating_to_model_input_dtypes(x, model): + """Casts the given data tensors to the dtypes of the model inputs. + + Casts only if the input is already a floating point type. + + Args: + x: tensor or list/tuple of tensors. 
+ model: The model. + + Returns: + Converted input. Each tensor is casted to the corresponding input in + `model.inputs`. + """ + # TODO(b/131372221): We should probably cast even if the input is not + # floating-point. + input_dtypes = nest.map_structure(lambda t: t.dtype, model.inputs) + return nest.map_structure(cast_single_tensor, x, input_dtypes) + + def get_output_sample_weight(skip_target_weighing_indices, sample_weight_mode, output_name, output_index): """Returns the sample weight and weight mode for a single output.""" diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py index f1d57e3bd5d..2f8b13cb459 100644 --- a/tensorflow/python/keras/layers/normalization.py +++ b/tensorflow/python/keras/layers/normalization.py @@ -26,6 +26,7 @@ from tensorflow.python.keras import backend as K from tensorflow.python.keras import constraints from tensorflow.python.keras import initializers from tensorflow.python.keras import regularizers +from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.keras.engine.input_spec import InputSpec from tensorflow.python.keras.utils import tf_utils @@ -126,6 +127,8 @@ class BatchNormalizationBase(Layer): References: - [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167) + + {{TRAINABLE_ATTRIBUTE_NOTE}} """ # By default, the base class uses V2 behavior. The BatchNormalization V1 @@ -156,7 +159,7 @@ class BatchNormalizationBase(Layer): name=None, **kwargs): super(BatchNormalizationBase, self).__init__( - name=name, trainable=trainable, **kwargs) + name=name, **kwargs) if isinstance(axis, list): self.axis = axis[:] elif isinstance(axis, int): @@ -193,6 +196,8 @@ class BatchNormalizationBase(Layer): self.fused = fused self._bessels_correction_test_only = True + self._trainable_var = None + self.trainable = trainable if renorm: renorm_clipping = renorm_clipping or {} @@ -235,6 +240,22 @@ class BatchNormalizationBase(Layer): except ValueError: return False + @property + def trainable(self): + return self._trainable + + @trainable.setter + def trainable(self, value): + self._trainable = value + if self._trainable_var is not None: + self._trainable_var.update_value(value) + + def _get_trainable_var(self): + if self._trainable_var is None: + self._trainable_var = K.freezable_variable( + self._trainable, name=self.name + '_trainable') + return self._trainable_var + @property def _param_dtype(self): # Raise parameters of fp16 batch norm to fp32 @@ -598,9 +619,20 @@ class BatchNormalizationBase(Layer): K.zeros_like(variance)) return mean, variance - def call(self, inputs, training=None): + def _get_training_value(self, training=None): if training is None: training = K.learning_phase() + if self._USE_V2_BEHAVIOR: + if isinstance(training, int): + training = bool(training) + if base_layer_utils.is_in_keras_graph(): + training = math_ops.logical_and(training, self._get_trainable_var()) + else: + training = math_ops.logical_and(training, self.trainable) + return training + + def call(self, inputs, training=None): + training = self._get_training_value(training) if self.virtual_batch_size is not None: # Virtual batches (aka ghost batches) can be simulated by reshaping the @@ -813,27 +845,27 @@ class BatchNormalizationBase(Layer): return dict(list(base_config.items()) + list(config.items())) -def _replace_in_base_docstring(old, new): +def 
replace_in_base_docstring(replacements):
   string = BatchNormalizationBase.__doc__
-  if old not in string:
-    raise ValueError('Could not find following string in BatchNormalizationBase'
-                     ' docstring: "{}"'.format(old))
-  return string.replace(old, new)
+  for old, new in replacements:
+    assert old in string
+    string = string.replace(old, new)
+  return string


 @keras_export(v1=['keras.layers.BatchNormalization'])  # pylint: disable=missing-docstring
 class BatchNormalization(BatchNormalizationBase):

-  __doc__ = _replace_in_base_docstring(
-      '''
+  __doc__ = replace_in_base_docstring(
+      [('''
     fused: if `True`, use a faster, fused implementation, or raise a ValueError
       if the fused implementation cannot be used. If `None`, use the faster
       implementation if possible. If False, do not used the fused
       implementation.''',
-
-      '''
+        '''
     fused: if `None` or `True`, use a faster, fused implementation if possible.
-      If `False`, use the system recommended implementation.''')
+      If `False`, use the system recommended implementation.'''),
+       ('{{TRAINABLE_ATTRIBUTE_NOTE}}', '')])

   _USE_V2_BEHAVIOR = False

diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py
index 4afc153c243..1f5c4a2fafd 100644
--- a/tensorflow/python/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/layers/normalization_test.py
@@ -22,12 +22,16 @@ from absl.testing import parameterized
 import numpy as np

 from tensorflow.python import keras
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.layers import normalization
 from tensorflow.python.keras.layers import normalization_v2
 from tensorflow.python.keras.mixed_precision.experimental import policy
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent

@@ -154,6 +158,61 @@ class BatchNormalizationTest(keras_parameterized.TestCase):
     self.assertEqual(norm.beta.dtype.base_dtype, 'float32')
     self.assertEqual(norm.gamma.dtype.base_dtype, 'float32')

+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_batchnorm_non_trainable_with_fit(self):
+    inputs = keras.Input((3,))
+    bn = normalization_v2.BatchNormalization()
+    outputs = bn(inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile('rmsprop', 'mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(np.random.random((100, 3)), np.random.random((100, 3)))
+
+    test_data = np.random.random((10, 3))
+    test_targets = np.random.random((10, 3))
+    test_loss = model.evaluate(test_data, test_targets)
+
+    bn.trainable = False
+    model.compile('rmsprop', 'mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    train_loss = model.train_on_batch(test_data, test_targets)
+    self.assertAlmostEqual(test_loss, train_loss)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_batchnorm_non_trainable_with_tf_function(self):
+    inputs = keras.Input((3,))
+    bn = normalization_v2.BatchNormalization()
+    outputs = bn(inputs)
+    model = keras.Model(inputs, outputs)
+    loss_fn = keras.losses.MeanSquaredError()
+    optimizer = rmsprop_v2.RMSprop()
+
+    @def_function.function()
+    def train_step(x, y):
+      with backprop.GradientTape() as tape:
+        y_pred = model(x,
training=True)
+        loss = loss_fn(y, y_pred)
+      grads = tape.gradient(loss, model.trainable_weights)
+      optimizer.apply_gradients(zip(grads, model.trainable_weights))
+      return loss
+
+    @def_function.function()
+    def test_step(x, y):
+      y_pred = model(x, training=False)
+      loss = loss_fn(y, y_pred)
+      return loss
+
+    train_step(np.random.random((100, 3)), np.random.random((100, 3)))
+
+    test_data = np.random.random((10, 3))
+    test_targets = np.random.random((10, 3))
+    test_loss = test_step(test_data, test_targets)
+
+    bn.trainable = False
+    train_loss = train_step(test_data, test_targets)
+    if context.executing_eagerly():
+      self.assertAlmostEqual(test_loss.numpy(), train_loss.numpy())
+

 class BatchNormalizationV1Test(test.TestCase):

diff --git a/tensorflow/python/keras/layers/normalization_v2.py b/tensorflow/python/keras/layers/normalization_v2.py
index 05501a7bf2c..6a1049e773f 100644
--- a/tensorflow/python/keras/layers/normalization_v2.py
+++ b/tensorflow/python/keras/layers/normalization_v2.py
@@ -18,11 +18,48 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-from tensorflow.python.keras.layers.normalization import BatchNormalizationBase
+from tensorflow.python.keras.layers import normalization
 from tensorflow.python.util.tf_export import keras_export


 @keras_export('keras.layers.BatchNormalization', v1=[])  # pylint: disable=missing-docstring
-class BatchNormalization(BatchNormalizationBase):
+class BatchNormalization(normalization.BatchNormalizationBase):
+
+  __doc__ = normalization.replace_in_base_docstring([
+      ('{{TRAINABLE_ATTRIBUTE_NOTE}}',
+       '''
+  **About setting `layer.trainable = False` on a `BatchNormalization` layer:**
+
+  The meaning of setting `layer.trainable = False` is to freeze the layer,
+  i.e. its internal state will not change during training:
+  its trainable weights will not be updated
+  during `fit()` or `train_on_batch()`, and its state updates will not be run.
+
+  Usually, this does not necessarily mean that the layer is run in inference
+  mode (which is normally controlled by the `training` argument that can
+  be passed when calling a layer). "Frozen state" and "inference mode"
+  are two separate concepts.
+
+  However, in the case of the `BatchNormalization` layer, **setting
+  `trainable = False` on the layer means that the layer will be
+  subsequently run in inference mode** (meaning that it will use
+  the moving mean and the moving variance to normalize the current batch,
+  rather than using the mean and variance of the current batch).
+
+  This behavior has been introduced in TensorFlow 2.0, in order
+  to enable `layer.trainable = False` to produce the most commonly
+  expected behavior in the convnet fine-tuning use case.
+
+  Note that:
+    - This behavior only occurs as of TensorFlow 2.0. In 1.*,
+      setting `layer.trainable = False` would freeze the layer but would
+      not switch it to inference mode.
+    - Setting `trainable` on a model containing other layers will
+      recursively set the `trainable` value of all inner layers.
+    - If the value of the `trainable`
+      attribute is changed after calling `compile()` on a model,
+      the new value doesn't take effect for this model
+      until `compile()` is called again.
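For readers skimming the diff, the note above pairs with a small usage sketch. The snippet below is illustrative only and not part of the patch; it restates, against the public tf.keras API, the behavior that the new test_batchnorm_non_trainable_with_fit test asserts: after freezing the layer and recompiling, a training step normalizes with the moving statistics, so its loss matches the evaluation loss.

import numpy as np
import tensorflow as tf

inputs = tf.keras.Input((3,))
bn = tf.keras.layers.BatchNormalization()
model = tf.keras.Model(inputs, bn(inputs))
model.compile('rmsprop', 'mse')
model.fit(np.random.random((100, 3)), np.random.random((100, 3)), verbose=0)

data = np.random.random((10, 3))
targets = np.random.random((10, 3))
test_loss = model.evaluate(data, targets, verbose=0)

# In TF 2.0, freezing the layer also switches it to inference mode, so this
# train step uses the moving mean/variance and reproduces the evaluation loss.
bn.trainable = False
model.compile('rmsprop', 'mse')  # recompile so the new `trainable` value takes effect
train_loss = model.train_on_batch(data, targets)
print(train_loss, test_loss)  # the two values agree (cf. assertAlmostEqual in the new test)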
+ ''')]) _USE_V2_BEHAVIOR = True diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py index b88b42e445d..bcfad5613a4 100644 --- a/tensorflow/python/keras/losses.py +++ b/tensorflow/python/keras/losses.py @@ -27,9 +27,9 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import smart_cond from tensorflow.python.keras import backend as K from tensorflow.python.keras.utils import losses_utils +from tensorflow.python.keras.utils import tf_utils from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object from tensorflow.python.keras.utils.generic_utils import serialize_keras_object -from tensorflow.python.keras.utils.tf_utils import is_tensor_or_variable from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn @@ -117,7 +117,9 @@ class Loss(object): # If we are wrapping a lambda function strip '<>' from the name as it is not # accepted in scope name. scope_name = 'lambda' if self.name == '<lambda>' else self.name - with K.name_scope(scope_name or self.__class__.__name__): + graph_ctx = tf_utils.graph_context_for_symbolic_tensors( + y_true, y_pred, sample_weight) + with K.name_scope(scope_name or self.__class__.__name__), graph_ctx: losses = self.call(y_true, y_pred) return losses_utils.compute_weighted_loss( losses, sample_weight, reduction=self._get_reduction()) @@ -215,7 +217,7 @@ class LossFunctionWrapper(Loss): def get_config(self): config = {} for k, v in six.iteritems(self._fn_kwargs): - config[k] = K.eval(v) if is_tensor_or_variable(v) else v + config[k] = K.eval(v) if tf_utils.is_tensor_or_variable(v) else v base_config = super(LossFunctionWrapper, self).get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py index 42ece0e1849..9c9ca564696 100644 --- a/tensorflow/python/keras/metrics.py +++ b/tensorflow/python/keras/metrics.py @@ -2753,8 +2753,8 @@ def sparse_categorical_accuracy(y_true, y_pred): @keras_export('keras.metrics.top_k_categorical_accuracy') def top_k_categorical_accuracy(y_true, y_pred, k=5): - return K.mean( - nn.in_top_k(y_pred, math_ops.argmax(y_true, axis=-1), k), axis=-1) + return math_ops.cast( + nn.in_top_k(y_pred, math_ops.argmax(y_true, axis=-1), k), K.floatx()) @keras_export('keras.metrics.sparse_top_k_categorical_accuracy') @@ -2766,7 +2766,8 @@ def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5): K.int_shape(y_true)) == len(K.int_shape(y_pred))): y_true = array_ops.squeeze(y_true, [-1]) - return K.mean(nn.in_top_k(y_pred, math_ops.cast(y_true, 'int32'), k), axis=-1) + return math_ops.cast( + nn.in_top_k(y_pred, math_ops.cast(y_true, 'int32'), k), K.floatx()) # Aliases diff --git a/tensorflow/python/keras/metrics_correctness_test.py b/tensorflow/python/keras/metrics_correctness_test.py index abef3c4d3f1..4f761bfec11 100644 --- a/tensorflow/python/keras/metrics_correctness_test.py +++ b/tensorflow/python/keras/metrics_correctness_test.py @@ -18,32 +18,58 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python import tf2 from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import layers +from tensorflow.python.keras import losses from tensorflow.python.keras import metrics from tensorflow.python.keras import testing_utils +from 
tensorflow.python.ops.losses import loss_reduction from tensorflow.python.platform import test +def get_multi_io_model(): + inp_1 = layers.Input(shape=(1,), name='input_1') + inp_2 = layers.Input(shape=(1,), name='input_2') + x = layers.Dense(3, kernel_initializer='ones', trainable=False) + out_1 = layers.Dense( + 1, kernel_initializer='ones', name='output_1', trainable=False) + out_2 = layers.Dense( + 1, kernel_initializer='ones', name='output_2', trainable=False) + + branch_a = [inp_1, x, out_1] + branch_b = [inp_2, x, out_2] + return testing_utils.get_multi_io_model(branch_a, branch_b) + + +def custom_generator_multi_io(): + batch_size = 2 + num_samples = 4 + inputs = np.asarray([[1.], [2.], [3.], [4.]]) + targets = np.asarray([[2.], [4.], [6.], [8.]]) + w1 = np.asarray([2., 3., 4., 5.]) + w2 = np.asarray([3.5, 2.5, 1.5, 0.5]) + i = 0 + while True: + batch_index = i * batch_size % num_samples + i += 1 + start = batch_index + end = start + batch_size + x = [inputs[start:end], inputs[start:end]] + y = [targets[start:end], targets[start:end]] + w = [w1[start:end], w2[start:end]] + yield x, y, w + + @keras_parameterized.run_with_all_model_types(exclude_models=['sequential']) @keras_parameterized.run_all_keras_modes class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase): - def _get_multi_io_model(self): - inp_1 = layers.Input(shape=(1,), name='input_1') - inp_2 = layers.Input(shape=(1,), name='input_2') - x = layers.Dense(3, kernel_initializer='ones', trainable=False) - out_1 = layers.Dense( - 1, kernel_initializer='ones', name='output_1', trainable=False) - out_2 = layers.Dense( - 1, kernel_initializer='ones', name='output_2', trainable=False) - - branch_a = [inp_1, x, out_1] - branch_b = [inp_2, x, out_2] - model = testing_utils.get_multi_io_model(branch_a, branch_b) + def _get_compiled_multi_io_model(self): + model = get_multi_io_model() model.compile( optimizer='rmsprop', loss='mse', @@ -135,7 +161,7 @@ class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase): self.expected_batch_result = [41.25, 32.5, 8.75, 7.5, 9.286, 7.5, 4.375] def test_fit(self): - model = self._get_multi_io_model() + model = self._get_compiled_multi_io_model() history = model.fit([self.x, self.x], [self.y, self.y], sample_weight={ 'output_1': self.weights_1, @@ -148,7 +174,7 @@ class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase): self.assertAllClose(history.history[key], value, 1e-3) def test_eval(self): - model = self._get_multi_io_model() + model = self._get_compiled_multi_io_model() eval_result = model.evaluate([self.x, self.x], [self.y, self.y], batch_size=2, sample_weight={ @@ -167,7 +193,7 @@ class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase): self.assertAllClose(mse1, mse2, 1e-3) def test_train_on_batch(self): - model = self._get_multi_io_model() + model = self._get_compiled_multi_io_model() result = model.train_on_batch([self.x, self.x], [self.y, self.y], sample_weight={ 'output_1': self.weights_1, @@ -176,7 +202,7 @@ class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase): self.assertAllClose(result, self.expected_batch_result, 1e-3) def test_test_on_batch(self): - model = self._get_multi_io_model() + model = self._get_compiled_multi_io_model() result = model.test_on_batch([self.x, self.x], [self.y, self.y], sample_weight={ 'output_1': self.weights_1, @@ -185,15 +211,15 @@ class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase): self.assertAllClose(result, self.expected_batch_result, 1e-3) def test_fit_generator(self): - model = 
self._get_multi_io_model() + model = self._get_compiled_multi_io_model() history = model.fit_generator( - self._custom_generator(), steps_per_epoch=2, epochs=2) + custom_generator_multi_io(), steps_per_epoch=2, epochs=2) for key, value in self.expected_fit_result.items(): self.assertAllClose(history.history[key], value, 1e-3) def test_eval_generator(self): - model = self._get_multi_io_model() - eval_result = model.evaluate_generator(self._custom_generator(), steps=2) + model = self._get_compiled_multi_io_model() + eval_result = model.evaluate_generator(custom_generator_multi_io(), steps=2) self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) @@ -318,5 +344,150 @@ class TestMetricsCorrectnessSingleIO(keras_parameterized.TestCase): self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) +@keras_parameterized.run_with_all_model_types(exclude_models=['sequential']) +@keras_parameterized.run_all_keras_modes +@parameterized.parameters([ + loss_reduction.ReductionV2.SUM_OVER_BATCH_SIZE, + loss_reduction.ReductionV2.AUTO, + loss_reduction.ReductionV2.SUM +]) +class TestOutputLossMetrics(keras_parameterized.TestCase): + + def _get_compiled_multi_io_model(self, loss): + model = get_multi_io_model() + model.compile( + optimizer='rmsprop', + loss=loss, + run_eagerly=testing_utils.should_run_eagerly()) + return model + + def setUp(self): + super(TestOutputLossMetrics, self).setUp() + self.x = np.asarray([[1.], [2.], [3.], [4.]]) + self.y = np.asarray([[2.], [4.], [6.], [8.]]) + self.weights_1 = np.asarray([2., 3., 4., 5.]) + self.weights_2 = np.asarray([3.5, 2.5, 1.5, 0.5]) + + # y_true = [[2.], [4.], [6.], [8.]], y_pred = [[3.], [6.], [9.], [12.]] + + # Loss `output_1`: + # Per-sample weighted losses + # Batch 1 = [(3 - 2)^2 * 2, (6 - 4)^2 * 3)] = [2, 12] + # Batch 2 = [((9 - 6)^2 * 4, (12 - 8)^2 * 5)] = [36, 80] + + # Result (reduction=SUM) = ((2 + 12) + (36 + 80))/2 = 65 + # Result (reduction=SUM_OVER_BATCH_SIZE/AUTO/NONE) = 130 / 4 = 32.5 + + # Loss `output_2`: + # Per-sample weighted losses + # Batch 1 = [(3 - 2)^2 * 3.5, (6 - 4)^2 * 2.5)] = [3.5, 10] + # Batch 2 = [(9 - 6)^2 * 1.5, (12 - 8)^2 * 0.5)] = [13.5, 8] + + # Result (reduction=SUM) = ((3.5 + 10) + (13.5 + 8))/2 = 17.5 + # Result (reduction=SUM_OVER_BATCH_SIZE/AUTO/NONE) = 35 / 4 = 8.75 + + # When reduction is 'NONE' loss value that is passed to the optimizer will + # be vector loss but what is reported is a scalar, which is an average of + # all the values in all the batch vectors. 
+ + # Total loss = Output_loss_1 + Output_loss_2 + + sum_over_batch_size_fit_result = { + 'loss': [41.25, 41.25], + 'output_1_loss': [32.5, 32.5], + 'output_2_loss': [8.75, 8.75], + } + + self.expected_fit_result = { + loss_reduction.ReductionV2.NONE: + sum_over_batch_size_fit_result, + loss_reduction.ReductionV2.SUM: { + 'loss': [82.5, 82.5], + 'output_1_loss': [65, 65], + 'output_2_loss': [17.5, 17.5], + }, + loss_reduction.ReductionV2.AUTO: + sum_over_batch_size_fit_result, + loss_reduction.ReductionV2.SUM_OVER_BATCH_SIZE: + sum_over_batch_size_fit_result, + } + + # In the order: 'loss', 'output_1_loss', 'output_2_loss', + self.expected_batch_result = { + loss_reduction.ReductionV2.NONE: [41.25, 32.5, 8.75], + loss_reduction.ReductionV2.SUM: [82.5, 65, 17.5], + loss_reduction.ReductionV2.AUTO: [41.25, 32.5, 8.75], + loss_reduction.ReductionV2.SUM_OVER_BATCH_SIZE: [41.25, 32.5, 8.75], + } + + def test_fit(self, reduction): + model = self._get_compiled_multi_io_model( + loss=losses.MeanSquaredError(reduction=reduction)) + history = model.fit([self.x, self.x], [self.y, self.y], + sample_weight={ + 'output_1': self.weights_1, + 'output_2': self.weights_2, + }, + batch_size=2, + epochs=2, + shuffle=False) + for key, value in self.expected_fit_result[reduction].items(): + self.assertAllClose(history.history[key], value) + + def test_eval(self, reduction): + model = self._get_compiled_multi_io_model( + loss=losses.MeanSquaredError(reduction=reduction)) + eval_result = model.evaluate([self.x, self.x], [self.y, self.y], + batch_size=2, + sample_weight={ + 'output_1': self.weights_1, + 'output_2': self.weights_2, + }) + self.assertAllClose(eval_result, self.expected_batch_result[reduction]) + + def test_train_on_batch(self, reduction): + model = self._get_compiled_multi_io_model( + loss=losses.MeanSquaredError(reduction=reduction)) + result = model.train_on_batch([self.x, self.x], [self.y, self.y], + sample_weight={ + 'output_1': self.weights_1, + 'output_2': self.weights_2, + }) + + expected_values = self.expected_batch_result[reduction] + if reduction == loss_reduction.ReductionV2.SUM: + # We are taking all the data as one batch, so undo the averaging here. + expected_values = [x * 2 for x in self.expected_batch_result[reduction]] + self.assertAllClose(result, expected_values) + + def test_test_on_batch(self, reduction): + model = self._get_compiled_multi_io_model( + loss=losses.MeanSquaredError(reduction=reduction)) + result = model.test_on_batch([self.x, self.x], [self.y, self.y], + sample_weight={ + 'output_1': self.weights_1, + 'output_2': self.weights_2, + }) + expected_values = self.expected_batch_result[reduction] + if reduction == loss_reduction.ReductionV2.SUM: + # We are taking all the data as one batch, so undo the averaging here. 
+ expected_values = [x * 2 for x in self.expected_batch_result[reduction]] + self.assertAllClose(result, expected_values) + + def test_fit_generator(self, reduction): + model = self._get_compiled_multi_io_model( + loss=losses.MeanSquaredError(reduction=reduction)) + history = model.fit_generator( + custom_generator_multi_io(), steps_per_epoch=2, epochs=2) + for key, value in self.expected_fit_result[reduction].items(): + self.assertAllClose(history.history[key], value) + + def test_eval_generator(self, reduction): + model = self._get_compiled_multi_io_model( + loss=losses.MeanSquaredError(reduction=reduction)) + eval_result = model.evaluate_generator(custom_generator_multi_io(), steps=2) + self.assertAllClose(eval_result, self.expected_batch_result[reduction]) + + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/keras/metrics_functional_test.py b/tensorflow/python/keras/metrics_functional_test.py index 513daaf9fcc..40478d28f24 100644 --- a/tensorflow/python/keras/metrics_functional_test.py +++ b/tensorflow/python/keras/metrics_functional_test.py @@ -85,37 +85,37 @@ class KerasFunctionalMetricsTest(test.TestCase): y_true = K.variable(np.array([[1], [0]])) result = K.eval( metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3)) - self.assertEqual(result, 1) + self.assertEqual(np.mean(result), 1) result = K.eval( metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2)) - self.assertEqual(result, 0.5) + self.assertEqual(np.mean(result), 0.5) result = K.eval( metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1)) - self.assertEqual(result, 0.) + self.assertEqual(np.mean(result), 0.) # Test correctness if the shape of y_true is (num_samples,) y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]])) y_true = K.variable(np.array([1, 0])) result = K.eval( metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3)) - self.assertEqual(result, 1) + self.assertEqual(np.mean(result), 1) result = K.eval( metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2)) - self.assertEqual(result, 0.5) + self.assertEqual(np.mean(result), 0.5) result = K.eval( metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1)) - self.assertEqual(result, 0.) + self.assertEqual(np.mean(result), 0.) def test_top_k_categorical_accuracy(self): with self.cached_session(): y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]])) y_true = K.variable(np.array([[0, 1, 0], [1, 0, 0]])) result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=3)) - self.assertEqual(result, 1) + self.assertEqual(np.mean(result), 1) result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=2)) - self.assertEqual(result, 0.5) + self.assertEqual(np.mean(result), 0.5) result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=1)) - self.assertEqual(result, 0.) + self.assertEqual(np.mean(result), 0.) if __name__ == '__main__': diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py index c8b3a35f4d0..b4f10259f2c 100644 --- a/tensorflow/python/keras/metrics_test.py +++ b/tensorflow/python/keras/metrics_test.py @@ -934,6 +934,15 @@ class TopKCategoricalAccuracyTest(test.TestCase): result = a_obj(y_true, y_pred) self.assertEqual(0.5, self.evaluate(result)) # only 1 sample matches. 
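As a side note on the metric change above (an illustrative sketch, not part of the patch): top_k_categorical_accuracy now returns one match value per sample instead of a pre-reduced mean, which is what lets the stateful TopKCategoricalAccuracy wrapper honor sample_weight, as the test_weighted cases added below exercise.

import tensorflow as tf

y_true = tf.constant([[0., 1., 0.], [1., 0., 0.]])
y_pred = tf.constant([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]])

# Per-sample values, [1., 0.] here: sample 0 has its label in the top 2,
# sample 1 does not. The old implementation returned their mean (0.5) directly.
per_sample = tf.keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=2)

# The stateful wrapper now performs the reduction itself, so sample weights apply.
m = tf.keras.metrics.TopKCategoricalAccuracy(k=2)
m.update_state(y_true, y_pred, sample_weight=tf.constant([1., 0.]))
print(per_sample.numpy(), m.result().numpy())  # [1. 0.] 1.0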
+ def test_weighted(self): + a_obj = metrics.TopKCategoricalAccuracy(k=2) + self.evaluate(variables.variables_initializer(a_obj.variables)) + y_true = constant_op.constant([[0, 1, 0], [1, 0, 0], [0, 0, 1]]) + y_pred = constant_op.constant([[0, 0.9, 0.1], [0, 0.9, 0.1], [0, 0.9, 0.1]]) + sample_weight = constant_op.constant((1.0, 0.0, 1.0)) + result = a_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(1.0, self.evaluate(result), atol=1e-5) + @test_util.run_all_in_graph_and_eager_modes class SparseTopKCategoricalAccuracyTest(test.TestCase): @@ -972,6 +981,15 @@ class SparseTopKCategoricalAccuracyTest(test.TestCase): result = a_obj(y_true, y_pred) self.assertEqual(0.5, self.evaluate(result)) # only 1 sample matches. + def test_weighted(self): + a_obj = metrics.SparseTopKCategoricalAccuracy(k=2) + self.evaluate(variables.variables_initializer(a_obj.variables)) + y_true = constant_op.constant([1, 0, 2]) + y_pred = constant_op.constant([[0, 0.9, 0.1], [0, 0.9, 0.1], [0, 0.9, 0.1]]) + sample_weight = constant_op.constant((1.0, 0.0, 1.0)) + result = a_obj(y_true, y_pred, sample_weight=sample_weight) + self.assertAllClose(1.0, self.evaluate(result), atol=1e-5) + @test_util.run_all_in_graph_and_eager_modes class LogCoshErrorTest(test.TestCase): diff --git a/tensorflow/python/keras/mixed_precision/experimental/BUILD b/tensorflow/python/keras/mixed_precision/experimental/BUILD index 628944c2b05..076227b59d1 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/BUILD +++ b/tensorflow/python/keras/mixed_precision/experimental/BUILD @@ -52,6 +52,7 @@ py_test( srcs = [ "policy_test.py", ], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":policy", @@ -78,6 +79,7 @@ py_test( name = "autocast_variable_test", size = "medium", srcs = ["autocast_variable_test.py"], + python_version = "PY2", deps = [ ":autocast_variable", "//tensorflow/python:client_testlib", @@ -128,6 +130,7 @@ py_test( name = "loss_scale_test", size = "medium", srcs = ["loss_scale_test.py"], + python_version = "PY2", deps = [ ":loss_scale", "//tensorflow/python:client_testlib", diff --git a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py index 8ff28fbf8b5..9c2d76e8efa 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py @@ -293,9 +293,14 @@ class KerasModelTest(test.TestCase, parameterized.TestCase): 'testcase_name': 'regularizer', 'strategy_fn': create_mirrored_strategy, 'use_regularizer': True + }, { + 'testcase_name': 'nocloning', + 'strategy_fn': create_mirrored_strategy, + 'cloning': False }) @test_util.run_in_graph_and_eager_modes - def test_model(self, strategy_fn, use_operator=False, use_regularizer=False): + def test_model(self, strategy_fn, use_operator=False, use_regularizer=False, + cloning=True): regularizer = IdentityRegularizer() if use_regularizer else None with strategy_fn().scope(): with policy.policy_scope('infer_float32_vars'): @@ -314,7 +319,7 @@ class KerasModelTest(test.TestCase, parameterized.TestCase): # the variable will not change. So this tests the learning rate not # applied to a float16 value, but instead the float32 variable. 
opt = gradient_descent.SGD(2 ** -14) - model.compile(opt, loss=loss_fn) + model.compile(opt, loss=loss_fn, cloning=cloning) self.assertEqual(backend.eval(layer.v), 1) x = np.ones((2, 1)) @@ -329,6 +334,53 @@ class KerasModelTest(test.TestCase, parameterized.TestCase): expected -= 2 ** -14 self.assertEqual(backend.eval(layer.v), expected) + @parameterized.named_parameters({ + 'testcase_name': 'base', + 'strategy_fn': default_strategy_fn + }, { + 'testcase_name': 'distribute', + 'strategy_fn': create_mirrored_strategy, + }, { + 'testcase_name': 'nocloning', + 'strategy_fn': create_mirrored_strategy, + 'cloning': False, + }) + @test_util.run_in_graph_and_eager_modes + def test_fixed_loss_scaling(self, strategy_fn, cloning=True): + # Note: We do not test mixed precision in this method, only loss scaling. + loss_scale = 8. + batch_size = 4 + with strategy_fn().scope(): + x = layers.Input(shape=(1,), batch_size=batch_size) + layer = AddLayer() + y = layer(x) + + # The gradient of 'y' at this point is 1. With loss scaling, the gradient + # is 'loss_scale'. We divide by the batch size since the loss is averaged + # across batch elements. + expected_gradient = loss_scale / batch_size + identity_with_grad_check_fn = ( + mp_test_util.create_identity_with_grad_check_fn([expected_gradient])) + y = core.Lambda(identity_with_grad_check_fn)(y) + model = models.Model(inputs=x, outputs=y) + + def loss_fn(y_true, y_pred): + del y_true + return math_ops.reduce_mean(y_pred) + + opt = gradient_descent.SGD(1.) + opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale) + model.compile(opt, loss=loss_fn, cloning=cloning) + + self.assertEqual(backend.eval(layer.v), 1) + x = np.ones((batch_size, 1)) + y = np.ones((batch_size, 1)) + dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(batch_size) + model.fit(dataset) + # Variable starts at 1, and should have gradient of 1 subtracted from it. + expected = 0 + self.assertEqual(backend.eval(layer.v), expected) + @parameterized.named_parameters({ 'testcase_name': 'base', 'strategy_fn': default_strategy_fn @@ -405,15 +457,21 @@ class KerasModelTest(test.TestCase, parameterized.TestCase): # Layer does not have weight regularizer self.assertEqual(backend.eval(layer.v), 1 - learning_rate) + # TODO(reedwm): Add and fix test where cloning=False is passed to + # Model.compile. Currently the test fails if cloning=False is passed. @parameterized.named_parameters({ 'testcase_name': 'base', 'strategy_fn': default_strategy_fn }, { 'testcase_name': 'distribute', 'strategy_fn': create_mirrored_strategy, + }, { + 'testcase_name': 'nocloning', + 'strategy_fn': create_mirrored_strategy, + 'cloning': False, }) @test_util.run_in_graph_and_eager_modes - def test_dynamic_loss_scaling(self, strategy_fn): + def test_dynamic_loss_scaling(self, strategy_fn, cloning=True): strategy = strategy_fn() initial_loss_scale = 2. 
batch_size = 4 @@ -447,12 +505,12 @@ class KerasModelTest(test.TestCase, parameterized.TestCase): loss_scale = loss_scale_module.DynamicLossScale( initial_loss_scale=initial_loss_scale, increment_period=2) opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale) - model.compile(opt, loss=loss_fn) + model.compile(opt, loss=loss_fn, cloning=cloning) self.assertEqual(backend.eval(layer.v), 1) - x = np.ones((2, 1)) - y = np.ones((2, 1)) - dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(2) + x = np.ones((batch_size, 1)) + y = np.ones((batch_size, 1)) + dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(batch_size) model.fit(dataset) # The variables starts with 1 and has a gradient of 1, so will go down by 1 # each step. diff --git a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py index 03276263247..e506df0791a 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py +++ b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py @@ -41,6 +41,20 @@ class _UnwrapPreventer(object): self.value = value +def scale_loss(loss, loss_scale): + """Scales the loss by the loss scale.""" + if callable(loss): + return lambda: loss() * loss_scale + else: + return loss * loss_scale + + +def unscale_grads(grads, loss_scale): + """Unscales the gradients by the loss scale.""" + loss_scale_reciprocal = 1. / loss_scale + return [g * loss_scale_reciprocal if g is not None else None for g in grads] + + @keras_export('keras.mixed_precision.experimental.LossScaleOptimizer') class LossScaleOptimizer(optimizer_v2.OptimizerV2): """An optimizer that applies loss scaling. @@ -98,38 +112,27 @@ class LossScaleOptimizer(optimizer_v2.OptimizerV2): self._optimizer = opt self._loss_scale = loss_scale_module.get(loss_scale) + self._track_trackable(self._optimizer, 'base_optimizer') self._track_trackable(self._loss_scale, 'loss_scale') def _compute_gradients(self, loss, var_list, grad_loss=None): - loss = self._scale_loss(loss) + loss = scale_loss(loss, self._loss_scale()) grads_and_vars = self._optimizer._compute_gradients(loss, var_list, # pylint: disable=protected-access grad_loss) grads = [g for g, _ in grads_and_vars] variables = [v for _, v in grads_and_vars] - scaled_grads = self._scale_grads(grads) - return list(zip(scaled_grads, variables)) + unscaled_grads = unscale_grads(grads, self._loss_scale()) + return list(zip(unscaled_grads, variables)) def get_gradients(self, loss, params): - loss = self._scale_loss(loss) + loss = scale_loss(loss, self._loss_scale()) grads = self._optimizer.get_gradients(loss, params) - return self._scale_grads(grads) - - def _scale_loss(self, loss): - # The loss is callable for `_compute_gradients`, but not `get_gradients`. 
- loss_scale = self._loss_scale() - if callable(loss): - return lambda: loss() * loss_scale - else: - return loss * loss_scale - - def _scale_grads(self, grads): - loss_scale = self._loss_scale() - loss_scale_reciprocal = 1 / loss_scale - return [None if g is None else g * loss_scale_reciprocal for g in grads] + return unscale_grads(grads, self._loss_scale()) def apply_gradients(self, grads_and_vars, name=None): if distribution_strategy_context.in_cross_replica_context(): raise ValueError('apply_gradients() must be called in a replica context.') + grads_and_vars = tuple(grads_and_vars) return distribution_strategy_context.get_replica_context().merge_call( self._apply_gradients_cross_replica, args=(grads_and_vars, name)) diff --git a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py index 83a03e0445d..e90d78d00d9 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py @@ -222,7 +222,7 @@ class LossScaleOptimizerTest(test.TestCase, parameterized.TestCase): loss_scale = loss_scale_module.DynamicLossScale( initial_loss_scale=1., increment_period=2., multiplier=2.) - opt = gradient_descent.SGD(1.) + opt = gradient_descent.SGD(1., momentum=1.) opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale) run_fn = lambda: opt.minimize(lambda: var + 1., var_list=[var]) opt_op = strategy.experimental_run(run_fn) @@ -230,9 +230,11 @@ class LossScaleOptimizerTest(test.TestCase, parameterized.TestCase): self.evaluate(opt_op) self.assertEqual(self.evaluate(loss_scale()), 1.) self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1) + slot_var = opt._optimizer.get_slot(var, 'momentum') + slot_value = self.evaluate(slot_var).item() # Save a checkpoint. - checkpoint = trackable_utils.Checkpoint(optimizer=opt) + checkpoint = trackable_utils.Checkpoint(optimizer=opt, var=var) prefix = os.path.join(self.get_temp_dir(), 'ckpt') save_path = checkpoint.save(prefix) @@ -240,6 +242,7 @@ class LossScaleOptimizerTest(test.TestCase, parameterized.TestCase): self.evaluate(strategy.experimental_run(run_fn)) self.assertEqual(self.evaluate(loss_scale()), 2.) self.assertEqual(self.evaluate(loss_scale._num_good_steps), 0) + self.assertNotAlmostEqual(self.evaluate(slot_var).item(), slot_value) # Load checkpoint and ensure loss scale is back to it's original value. status = checkpoint.restore(save_path) @@ -247,6 +250,7 @@ class LossScaleOptimizerTest(test.TestCase, parameterized.TestCase): status.run_restore_ops() self.assertEqual(self.evaluate(loss_scale()), 1.) 
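Tracking the wrapped optimizer as `'base_optimizer'` (the `_track_trackable` call above) means checkpointing a `LossScaleOptimizer` now also saves and restores the inner optimizer's slot variables, which is what the new momentum-slot assertions verify. A hedged sketch of that round trip using the public exports (assumes eager execution and the v2 Keras optimizers; the path and values are illustrative):

```python
import tensorflow as tf

var = tf.Variable(1.0)
opt = tf.keras.optimizers.SGD(1.0, momentum=1.0)
opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, loss_scale=8)

opt.minimize(lambda: var + 1.0, var_list=[var])  # populates the momentum slot

ckpt = tf.train.Checkpoint(optimizer=opt, var=var)
path = ckpt.save('/tmp/loss_scale_ckpt')         # illustrative path

opt.minimize(lambda: var + 1.0, var_list=[var])  # slot value changes again
ckpt.restore(path)                               # slot and loss scale roll back
```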
self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1) + self.assertAlmostEqual(self.evaluate(slot_var).item(), slot_value) if __name__ == '__main__': diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py index 3dd83df141a..6ae8795c837 100644 --- a/tensorflow/python/keras/models.py +++ b/tensorflow/python/keras/models.py @@ -406,12 +406,8 @@ def _in_place_subclassed_model_reset(model): attributes_to_cache = [ 'inputs', 'outputs', - '_feed_output_names', - '_feed_output_shapes', - '_feed_loss_fns', 'loss_weights_list', - 'targets', - '_feed_targets', + '_training_targets', '_sample_weight_modes', 'total_loss', 'sample_weights', diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD index 14884d90db9..0791fd3285c 100644 --- a/tensorflow/python/keras/optimizer_v2/BUILD +++ b/tensorflow/python/keras/optimizer_v2/BUILD @@ -203,6 +203,7 @@ py_test( name = "optimizer_v2_test", size = "medium", srcs = ["optimizer_v2_test.py"], + python_version = "PY2", shard_count = 8, tags = [ "no_gpu", # b/127001953 diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py index f0219881d30..0adacd25859 100644 --- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py +++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py @@ -674,7 +674,7 @@ class OptimizerV2(trackable.Trackable): if "learning_rate" in config: if isinstance(config["learning_rate"], dict): config["learning_rate"] = learning_rate_schedule.deserialize( - config["learning_rate"]) + config["learning_rate"], custom_objects=custom_objects) return cls(**config) def _serialize_hyperparameter(self, hyperparameter_name): diff --git a/tensorflow/python/keras/saving/saved_model.py b/tensorflow/python/keras/saving/saved_model.py index b853785a00f..866b884f1c3 100644 --- a/tensorflow/python/keras/saving/saved_model.py +++ b/tensorflow/python/keras/saving/saved_model.py @@ -298,7 +298,11 @@ def _export_mode( builder.add_meta_graph( model_utils.EXPORT_TAG_MAP[mode], signature_def_map=_create_signature_def_map(clone, mode), - saver=saver_lib.Saver(clone_var_list), + saver=saver_lib.Saver( + clone_var_list, + # Allow saving Models with no variables. This is somewhat odd, but + # it's not necessarily a bug. 
+ allow_empty=True), init_op=variables.local_variables_initializer(), train_op=train_op) return None @@ -309,7 +313,7 @@ def _create_signature_def_map(model, mode): inputs_dict = {name: x for name, x in zip(model.input_names, model.inputs)} if model.optimizer: targets_dict = {x.name.split(':')[0]: x - for x in model.targets if x is not None} + for x in model._targets if x is not None} inputs_dict.update(targets_dict) outputs_dict = {name: x for name, x in zip(model.output_names, model.outputs)} diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py index 81a9452f6a8..86ecb0f2822 100644 --- a/tensorflow/python/keras/testing_utils.py +++ b/tensorflow/python/keras/testing_utils.py @@ -406,7 +406,7 @@ class _SubclassModelCustomBuild(keras.Model): return x -def get_model_from_layers(layers, input_shape=None): +def get_model_from_layers(layers, input_shape=None, input_dtype=None): """Builds a model from a sequence of layers.""" model_type = get_model_type() if model_type == 'subclass': @@ -419,7 +419,8 @@ def get_model_from_layers(layers, input_shape=None): if model_type == 'sequential': model = keras.models.Sequential() if input_shape: - model.add(keras.layers.InputLayer(input_shape=input_shape)) + model.add(keras.layers.InputLayer(input_shape=input_shape, + dtype=input_dtype)) for layer in layers: model.add(layer) return model @@ -428,7 +429,7 @@ def get_model_from_layers(layers, input_shape=None): if not input_shape: raise ValueError('Cannot create a functional model from layers with no ' 'input shape.') - inputs = keras.Input(shape=input_shape) + inputs = keras.Input(shape=input_shape, dtype=input_dtype) outputs = inputs for layer in layers: outputs = layer(outputs) diff --git a/tensorflow/python/keras/utils/metrics_utils.py b/tensorflow/python/keras/utils/metrics_utils.py index 29bf2e7de0d..ce1eb3f1dcf 100644 --- a/tensorflow/python/keras/utils/metrics_utils.py +++ b/tensorflow/python/keras/utils/metrics_utils.py @@ -27,6 +27,7 @@ from enum import Enum from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.keras.utils import tf_utils from tensorflow.python.keras.utils.generic_utils import to_list from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions from tensorflow.python.ops import array_ops @@ -68,7 +69,8 @@ def update_state_wrapper(update_state_fn): def decorated(metric_obj, *args, **kwargs): """Decorated function with `add_update()`.""" - update_op = update_state_fn(*args, **kwargs) + with tf_utils.graph_context_for_symbolic_tensors(*args, **kwargs): + update_op = update_state_fn(*args, **kwargs) if update_op is not None: # update_op will be None in eager execution. 
metric_obj.add_update(update_op, inputs=True) return update_op diff --git a/tensorflow/python/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py index 3ad78b774d3..beed7ceef67 100644 --- a/tensorflow/python/keras/utils/tf_utils.py +++ b/tensorflow/python/keras/utils/tf_utils.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import smart_cond as smart_module from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util +from tensorflow.python.keras import backend as K from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import variables from tensorflow.python.util import nest @@ -415,3 +416,13 @@ def maybe_init_scope(layer): yield else: yield + + +@tf_contextlib.contextmanager +def graph_context_for_symbolic_tensors(*args, **kwargs): + """Returns graph context manager if any of the inputs is a symbolic tensor.""" + if any(is_symbolic_tensor(v) for v in list(args) + list(kwargs.values())): + with K.get_graph().as_default(): + yield + else: + yield diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 4196dac671f..32951c33776 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -467,6 +467,15 @@ tf_py_test( ], ) +tf_py_test( + name = "fingerprint_op_test", + size = "small", + srcs = ["fingerprint_op_test.py"], + additional_deps = [ + "//third_party/py/numpy", + ], +) + tf_py_test( name = "fractional_avg_pool_op_test", size = "small", @@ -1748,6 +1757,9 @@ cuda_py_test( "//tensorflow/python:while_v2", ], shard_count = 16, + tags = [ + "notsan", # TODO(b/132205147): Re-enable this. + ], xla_enable_strict_auto_jit = True, ) @@ -1776,7 +1788,6 @@ tf_py_test( "//tensorflow/python:control_flow_util_v2", "//tensorflow/python:while_v2", ], - tags = ["no_gpu"], # TODO(b/117796385): runs out of memory ) cuda_py_test( @@ -3231,6 +3242,7 @@ cuda_py_test( "//tensorflow/python:util", "//tensorflow/python:data_flow_ops", ], + tags = ["no_oss"], # b/124474135 xla_enable_strict_auto_jit = True, ) @@ -3423,7 +3435,9 @@ cuda_py_test( "no_rocm", # flaky test "no_windows", ], - # b/127344411: xla_enable_strict_auto_jit = True, + # TODO(b/127344411): This test passes because XLA does not actually cluster + # the self_adjoint_eig op. + xla_enable_strict_auto_jit = True, ) cuda_py_test( @@ -3460,7 +3474,9 @@ cuda_py_test( "no_oss", # b/117185141. "nomsan", # TODO(b/117236102): Re-enable in msan build. ], - # b/127344411: xla_enable_strict_auto_jit = True, + # TODO(b/127344411): This test passes because XLA does not actually cluster + # the svd op. 
+ xla_enable_strict_auto_jit = True, ) cuda_py_test( @@ -3480,7 +3496,7 @@ cuda_py_test( "no_windows_gpu", "nomsan", ], - # b/127344411: xla_enable_strict_auto_jit = True, + xla_enable_strict_auto_jit = True, ) cuda_py_test( diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index 5d91e5f9c9c..feb10431d40 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -448,12 +448,8 @@ class ControlFlowTest(test.TestCase): values = constant_op.constant(10) indices = constant_op.constant(0) x = ops.IndexedSlices(values, indices) - v1_msg = "The two structures don't have the same nested structure" - v2_msg = ("true_fn and false_fn arguments to tf.cond must have the same " - "number, type, and overall structure of return values.") with self.assertRaisesRegexp( - TypeError, - v2_msg if control_flow_util.ENABLE_CONTROL_FLOW_V2 else v1_msg): + TypeError, "Cannot reconcile tf.cond 0-th outputs"): control_flow_ops.cond( constant_op.constant(True), lambda: ops.IndexedSlices(math_ops.add(x.values, 1), indices), @@ -516,7 +512,6 @@ class ControlFlowTest(test.TestCase): self.assertAllEqual(sess.run(g, {pred: True}), [2.0, 2.0, 2.0]) self.assertAllEqual(sess.run(g, {pred: False}), [0.0, 0.0, 0.0]) - @test_util.disable_control_flow_v2("b/113293074") @test_util.run_v1_only("b/120545219") def testCondIndexedSlicesDifferentTypes(self): with self.cached_session(): diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py index 21af0d28322..1cb9cfa7479 100644 --- a/tensorflow/python/kernel_tests/cwise_ops_test.py +++ b/tensorflow/python/kernel_tests/cwise_ops_test.py @@ -1117,7 +1117,7 @@ class SingularGradientOpTest(test.TestCase): @test_util.run_deprecated_v1 def testGradientAtSingularity(self): - if not compat.forward_compatible(2019, 5, 14): + if not compat.forward_compatible(2019, 6, 14): self.skipTest("Skipping test for future functionality.") ops_and_singularity = [ diff --git a/tensorflow/python/kernel_tests/fingerprint_op_test.py b/tensorflow/python/kernel_tests/fingerprint_op_test.py new file mode 100644 index 00000000000..0af3f5182fa --- /dev/null +++ b/tensorflow/python/kernel_tests/fingerprint_op_test.py @@ -0,0 +1,42 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tensorflow.ops.fingerprint_op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test + + +# Fingerprint op has C++ tests. This simple test case tests that fingerprint +# function is accessible via Python API. 
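The new test only verifies that the op is reachable from Python. For reference, a hedged usage sketch of the `tf.fingerprint` wrapper added later in this change (eager mode assumed; shapes follow its docstring, one 8-byte `farmhash64` value per batch element):

```python
import numpy as np
import tensorflow as tf

# Two batch elements of 12 int32 values each -> two 8-byte fingerprints.
data = np.arange(24, dtype=np.int32).reshape(2, 3, 4)
fp = tf.fingerprint(data)   # method defaults to "farmhash64"
print(fp.shape)             # (2, 8), dtype uint8
```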
+class FingerprintTest(test.TestCase): + + def test_default_values(self): + data = np.arange(10) + data = np.expand_dims(data, axis=0) + fingerprint0 = self.evaluate(array_ops.fingerprint(data)) + fingerprint1 = self.evaluate(array_ops.fingerprint(data[:, 1:])) + self.assertEqual(fingerprint0.ndim, 2) + self.assertTupleEqual(fingerprint0.shape, fingerprint1.shape) + self.assertTrue(np.any(fingerprint0 != fingerprint1)) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py index 91683047a8f..29e06534b72 100644 --- a/tensorflow/python/kernel_tests/functional_ops_test.py +++ b/tensorflow/python/kernel_tests/functional_ops_test.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.core.framework import attr_value_pb2 from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session +from tensorflow.python.eager import def_function as eager_def_function from tensorflow.python.eager import function as eager_function from tensorflow.python.data.ops import iterator_ops from tensorflow.python.framework import constant_op @@ -943,6 +944,35 @@ class FunctionalOpsTest(test.TestCase): # below test cases. class PartitionedCallTest(test.TestCase): + @test_util.run_deprecated_v1 + def testRemoteDeviceInPartitionedCallOp(self): + workers, _ = test_util.create_local_cluster(2, 0) + + worker0_device = "/job:worker/replica:0/task:0/cpu:0" + worker1_device = "/job:worker/replica:0/task:1/cpu:0" + + @eager_def_function.function + def f(a, b): + return a + b + + with session.Session(workers[0].target) as sess: + with ops.device(worker0_device): + a = variable_scope.get_variable( + "a", initializer=constant_op.constant(1.), use_resource=True) + with ops.device(worker1_device): + b = variable_scope.get_variable( + "b", initializer=constant_op.constant(1.), use_resource=True) + + sess.run(variables.global_variables_initializer()) + + config = config_pb2.ConfigProto() + config.experimental.share_cluster_devices_in_session = True + + with session.Session(workers[0].target, config=config) as sess: + res = sess.run(f(a, b)) + + self.assertEqual(res, 2) + @test_util.run_deprecated_v1 def testBasicSingleDevice(self): diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py index c62f3f0fed4..02e9a6103e1 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py @@ -288,6 +288,75 @@ class LinearOperatorTest(test.TestCase): self.assertTrue(operator2.matmul(operator1).is_square) self.assertFalse(operator1.matmul(operator3).is_square) + def testDispatchedMethods(self): + operator = linalg.LinearOperatorFullMatrix( + [[1., 0.5], [0.5, 1.]], + is_square=True, + is_self_adjoint=True, + is_non_singular=True, + is_positive_definite=True) + methods = { + "trace": linalg.trace, + "diag_part": linalg.diag_part, + "log_abs_determinant": linalg.logdet, + "determinant": linalg.det + } + for method in methods: + op_val = getattr(operator, method)() + linalg_val = methods[method](operator) + self.assertAllClose( + self.evaluate(op_val), + self.evaluate(linalg_val)) + # Solve and Matmul go here. 
+ + adjoint = linalg.adjoint(operator) + self.assertIsInstance(adjoint, linalg.LinearOperator) + cholesky = linalg.cholesky(operator) + self.assertIsInstance(cholesky, linalg.LinearOperator) + inverse = linalg.inv(operator) + self.assertIsInstance(inverse, linalg.LinearOperator) + + def testDispatchMatmulSolve(self): + operator = linalg.LinearOperatorFullMatrix( + np.float64([[1., 0.5], [0.5, 1.]]), + is_square=True, + is_self_adjoint=True, + is_non_singular=True, + is_positive_definite=True) + rhs = np.random.uniform(-1., 1., size=[3, 2, 2]) + for adjoint in [False, True]: + for adjoint_arg in [False, True]: + op_val = operator.matmul( + rhs, adjoint=adjoint, adjoint_arg=adjoint_arg) + matmul_val = math_ops.matmul( + operator, rhs, adjoint_a=adjoint, adjoint_b=adjoint_arg) + self.assertAllClose( + self.evaluate(op_val), self.evaluate(matmul_val)) + + op_val = operator.solve(rhs, adjoint=adjoint) + solve_val = linalg.solve(operator, rhs, adjoint=adjoint) + self.assertAllClose( + self.evaluate(op_val), self.evaluate(solve_val)) + + def testDispatchMatmulLeftOperatorIsTensor(self): + mat = np.float64([[1., 0.5], [0.5, 1.]]) + right_operator = linalg.LinearOperatorFullMatrix( + mat, + is_square=True, + is_self_adjoint=True, + is_non_singular=True, + is_positive_definite=True) + lhs = np.random.uniform(-1., 1., size=[3, 2, 2]) + + for adjoint in [False, True]: + for adjoint_arg in [False, True]: + op_val = math_ops.matmul( + lhs, mat, adjoint_a=adjoint, adjoint_b=adjoint_arg) + matmul_val = math_ops.matmul( + lhs, right_operator, adjoint_a=adjoint, adjoint_b=adjoint_arg) + self.assertAllClose( + self.evaluate(op_val), self.evaluate(matmul_val)) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py index 81f7290f07d..a3dd7dbf2af 100644 --- a/tensorflow/python/kernel_tests/matmul_op_test.py +++ b/tensorflow/python/kernel_tests/matmul_op_test.py @@ -99,8 +99,8 @@ def _GetMatMulTest(a_np_, b_np_, use_static_shape_, **kwargs_): self.assertAllCloseAccordingToType( tf_val, np_val, - float_rtol=2e-5, - float_atol=2e-5, + float_rtol=3e-5, + float_atol=3e-5, half_rtol=0.2, half_atol=0.2) diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py index 82625c9b23e..683315b8195 100644 --- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py +++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py @@ -789,7 +789,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase, [assign], feed_dict={placeholder: np.zeros(shape=[2, 2], dtype=np.float32)}) - def testAssignDifferentShapesEager(self): + def testAssignDifferentShapesEagerNotAllowed(self): with context.eager_mode(): with variable_scope.variable_scope("foo"): var = variable_scope.get_variable("x", shape=[1, 1], @@ -799,6 +799,18 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase, assign = var.assign(np.zeros(shape=[2, 2])) self.evaluate(assign) + @test_util.disable_xla("XLA doesn't allow changing shape at assignment, as " + "dictated by tf2xla/xla_resource.cc:SetTypeAndShape") + @test_util.run_in_graph_and_eager_modes + def testAssignDifferentShapesAllowed(self): + var = resource_variable_ops.ResourceVariable( + initial_value=np.zeros(shape=[1, 1]), + shape=tensor_shape.TensorShape(None)) + self.evaluate(variables.global_variables_initializer()) + self.assertAllEqual(np.zeros(shape=[1, 1]), var.read_value()) + 
self.evaluate(var.assign(np.zeros(shape=[2, 2]))) + self.assertAllEqual(np.zeros(shape=[2, 2]), var.read_value()) + @test_util.run_deprecated_v1 def testDtypeAfterFromProto(self): v = resource_variable_ops.ResourceVariable(2.0) diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py index 4c320e7042d..ea69dff50a1 100644 --- a/tensorflow/python/kernel_tests/variables_test.py +++ b/tensorflow/python/kernel_tests/variables_test.py @@ -23,11 +23,13 @@ import operator import numpy as np +from tensorflow.python.eager import context from tensorflow.python.eager import function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -153,6 +155,22 @@ class VariablesTestCase(test.TestCase): self.evaluate(four) self.assertAllClose(4.0, self.evaluate(var)) + def testAssignDifferentShapesEagerNotAllowed(self): + with context.eager_mode(): + var = variables.Variable(np.zeros(shape=[1, 1])) + with self.assertRaisesRegexp(ValueError, + "Shapes.*and.*are incompatible"): + var.assign(np.zeros(shape=[2, 2])) + + @test_util.run_in_graph_and_eager_modes + def testAssignDifferentShapesAllowed(self): + var = variables.Variable(np.zeros(shape=[1, 1]), + shape=tensor_shape.TensorShape(None)) + self.evaluate(variables.global_variables_initializer()) + self.assertAllEqual(np.zeros(shape=[1, 1]), var.read_value()) + self.evaluate(var.assign(np.zeros(shape=[2, 2]))) + self.assertAllEqual(np.zeros(shape=[2, 2]), var.read_value()) + def testZeroSizeStringAssign(self): with self.cached_session() as sess: array = variables.VariableV1( diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 11ef64d3a4f..d587a46974a 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -3943,3 +3943,52 @@ def extract_image_patches( # pylint: disable=missing-docstring extract_image_patches.__doc__ = gen_array_ops.extract_image_patches.__doc__ + + +@tf_export("fingerprint") +def fingerprint(data, method="farmhash64", name=None): + r"""Generates fingerprint values. + + Generates fingerprint values of `data`. + + Fingerprint op considers the first dimension of `data` as the batch dimension, + and `output[i]` contains the fingerprint value generated from contents in + `data[i, ...]` for all `i`. + + Fingerprint op writes fingerprint values as byte arrays. For example, the + default method `farmhash64` generates a 64-bit fingerprint value at a time. + This 8-byte value is written out as an `tf.uint8` array of size 8, in + little-endian order. + + For example, suppose that `data` has data type `tf.int32` and shape (2, 3, 4), + and that the fingerprint method is `farmhash64`. In this case, the output + shape is (2, 8), where 2 is the batch dimension size of `data`, and 8 is the + size of each fingerprint value in bytes. `output[0, :]` is generated from + 12 integers in `data[0, :, :]` and similarly `output[1, :]` is generated from + other 12 integers in `data[1, :, :]`. + + Note that this op fingerprints the raw underlying buffer, and it does not + fingerprint Tensor's metadata such as data type and/or shape. 
For example, the + fingerprint values are invariant under reshapes and bitcasts as long as the + batch dimension remain the same: + + ```python + tf.fingerprint(data) == tf.fingerprint(tf.reshape(data, ...)) + tf.fingerprint(data) == tf.fingerprint(tf.bitcast(data, ...)) + ``` + + For string data, one should expect `tf.fingerprint(data) != + tf.fingerprint(tf.string.reduce_join(data))` in general. + + Args: + data: A `Tensor`. Must have rank 1 or higher. + method: A `Tensor` of type `tf.string`. Fingerprint method used by this op. + Currently available method is `farmhash64`. + name: A name for the operation (optional). + + Returns: + A two-dimensional `Tensor` of type `tf.uint8`. The first dimension equals to + `data`'s first dimension, and the second dimension size depends on the + fingerprint algorithm. + """ + return gen_array_ops.fingerprint(data, method, name) diff --git a/tensorflow/python/ops/cond_v2.py b/tensorflow/python/ops/cond_v2.py index 38bcb84a2aa..5d661397b3d 100644 --- a/tensorflow/python/ops/cond_v2.py +++ b/tensorflow/python/ops/cond_v2.py @@ -185,6 +185,7 @@ def _build_cond(pred, true_graph, false_graph, true_inputs, false_inputs, A list of Tensors which are the outputs of the If op. Does not include added intermediate outputs. """ + _make_indexed_slices_indices_types_match(true_graph, false_graph) _check_same_outputs(true_graph, false_graph) # Add inputs to true_graph and false_graph to make them match. Note that @@ -522,6 +523,63 @@ def _make_output_composite_tensors_match(true_graph, false_graph): false_graph.outputs = func_graph_module.flatten(false_outputs) +def _make_indexed_slices_indices_types_match(true_graph, false_graph): + """Match dtype of IndexedSlices.indices in outputs of {true|false}_graphs.""" + indexed_slice_indices = [] + current_index = 0 + true_outputs_flat_with_composites = nest.flatten( + true_graph.structured_outputs, expand_composites=False) + false_outputs_flat_with_composites = nest.flatten( + false_graph.structured_outputs, expand_composites=False) + # Store indices of IndexedSlices.indices in `indexed_slice_indices`. + for idx, (true_out, false_out) in enumerate( + zip(true_outputs_flat_with_composites, + false_outputs_flat_with_composites)): + if isinstance(true_out, ops.IndexedSlices) != isinstance( + false_out, ops.IndexedSlices): + raise TypeError("Cannot reconcile tf.cond %i-th outputs:\n" + " true_fn returned: %s\n" + " false_fn returned: %s" % (idx, true_out, false_out)) + if isinstance(true_out, ops.IndexedSlices): + # indices is the second component of the composite tensor. + indexed_slice_indices.append(current_index + 1) + if nest.is_sequence_or_composite(true_out): + current_index += len(nest.flatten(true_out, expand_composites=True)) + else: + current_index += 1 + + if not indexed_slice_indices: + return + + if current_index != len(true_graph.outputs): + raise ValueError("Insufficient elements in true_graph.outputs.\n" + "Expected: %i\n" + "Actual: %i" % (current_index, len(true_graph.outputs))) + + # Cast indices with mismatching types to int64. + for index in indexed_slice_indices: + if true_graph.outputs[index].dtype not in (dtypes.int32, dtypes.int64): + raise TypeError("Type of IndexedSlices.indices must be int32 or int64. " + "Found: %s" % str(true_graph.outputs[index].dtype)) + if false_graph.outputs[index].dtype not in (dtypes.int32, dtypes.int64): + raise TypeError("Type of IndexedSlices.indices must be int32 or int64. 
" + "Found: %s" % str(false_graph.outputs[index].dtype)) + if true_graph.outputs[index].dtype != false_graph.outputs[index].dtype: + if false_graph.outputs[index].dtype == dtypes.int32: + with false_graph.as_default(): + false_graph.outputs[index] = math_ops.cast(false_graph.outputs[index], + dtypes.int64) + else: + with true_graph.as_default(): + true_graph.outputs[index] = math_ops.cast(true_graph.outputs[index], + dtypes.int64) + + true_graph.structured_outputs = func_graph_module.pack_sequence_as( + true_graph.structured_outputs, true_graph.outputs) + false_graph.structured_outputs = func_graph_module.pack_sequence_as( + false_graph.structured_outputs, false_graph.outputs) + + def _wrap_intermediates(func_graph, intermediates): with func_graph.as_default(): return [gen_dataset_ops.optional_from_value([t]) for t in intermediates] diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py index 683f78ce9b2..2ce28e8e1ab 100644 --- a/tensorflow/python/ops/gradient_checker.py +++ b/tensorflow/python/ops/gradient_checker.py @@ -284,10 +284,10 @@ def compute_gradient(x, numbers. For example, if `x` is complex with shape `[m]` and `y` is complex with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with - J[::2, ::2] = d(Re y)/d(Re x) - J[::2, 1::2] = d(Im y)/d(Re x) - J[1::2, ::2] = d(Re y)/d(Im x) - J[1::2, 1::2] = d(Im y)/d(Im x) + J[:m, :n] = d(Re y)/d(Re x) + J[:m, n:] = d(Im y)/d(Re x) + J[m:, :n] = d(Re y)/d(Im x) + J[m:, n:] = d(Im y)/d(Im x) Args: x: a tensor or list of tensors diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 7af9e319c35..e8a2cc98509 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -2139,7 +2139,11 @@ tf_export( 'io.decode_image', 'image.decode_image', v1=['io.decode_image', 'image.decode_image']) -def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): +def decode_image(contents, + channels=None, + dtype=dtypes.uint8, + name=None, + expand_animations=True): """Function for `decode_bmp`, `decode_gif`, `decode_jpeg`, and `decode_png`. Detects whether an image is a BMP, GIF, JPEG, or PNG, and performs the @@ -2150,7 +2154,9 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): opposed to `decode_bmp`, `decode_jpeg` and `decode_png`, which return 3-D arrays `[height, width, num_channels]`. Make sure to take this into account when constructing your graph if you are intermixing GIF files with BMP, JPEG, - and/or PNG files. + and/or PNG files. Alternately, set the `expand_animations` argument of this + function to `False`, in which case the op will return 3-dimensional tensors + and will truncate animated GIF files to the first frame. Args: contents: 0-D `string`. The encoded image bytes. @@ -2158,11 +2164,15 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): the decoded image. dtype: The desired DType of the returned `Tensor`. name: A name for the operation (optional) + expand_animations: Controls the shape of the returned op's output. + If `True`, the returned op will produce a 3-D tensor for PNG, JPEG, and + BMP files; and a 4-D tensor for all GIFs, whether animated or not. + If, `False`, the returned op will produce a 3-D tensor for all file + types and will truncate animated GIFs to the first frame. 
Returns: - `Tensor` with type `dtype` and shape `[height, width, num_channels]` for - BMP, JPEG, and PNG images and shape `[num_frames, height, width, 3]` for - GIF images. + `Tensor` with type `dtype` and a 3- or 4-dimensional shape, depending on + the file type and the value of the `expand_animations` parameter. Raises: ValueError: On incorrect number of channels. @@ -2173,7 +2183,7 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): substr = string_ops.substr(contents, 0, 3) def _bmp(): - """Decodes a GIF image.""" + """Decodes a BMP image.""" signature = string_ops.substr(contents, 0, 2) # Create assert op to check that bytes are BMP decodable is_bmp = math_ops.equal(signature, 'BM', name='is_bmp') @@ -2187,9 +2197,9 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): return convert_image_dtype(gen_image_ops.decode_bmp(contents), dtype) def _gif(): + """Decodes a GIF image.""" # Create assert to make sure that channels is not set to 1 # Already checked above that channels is in (None, 0, 1, 3) - gif_channels = 0 if channels is None else channels good_channels = math_ops.logical_and( math_ops.not_equal(gif_channels, 1, name='check_gif_channels'), @@ -2197,7 +2207,12 @@ def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None): channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images' assert_channels = control_flow_ops.Assert(good_channels, [channels_msg]) with ops.control_dependencies([assert_channels]): - return convert_image_dtype(gen_image_ops.decode_gif(contents), dtype) + result = convert_image_dtype(gen_image_ops.decode_gif(contents), dtype) + if not expand_animations: + # For now we decode animated GIFs fully and toss out all but the + # first frame when expand_animations is False + result = array_ops.gather(result, 0) + return result def check_gif(): # Create assert op to check that bytes are GIF decodable diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 79f4a54135e..3b7b699d501 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -5123,6 +5123,21 @@ class DecodeImageTest(test_util.TensorFlowTestCase): image0, image1 = self.evaluate([image0, image1]) self.assertAllEqual(image0, image1) + def testExpandAnimations(self): + with self.cached_session(use_gpu=True) as sess: + base = "tensorflow/core/lib/gif/testdata" + gif0 = io_ops.read_file(os.path.join(base, "scan.gif")) + image0 = image_ops.decode_image( + gif0, dtype=dtypes.float32, expand_animations=False) + # image_ops.decode_png() handles GIFs and returns 3D tensors + animation = image_ops.decode_gif(gif0) + first_frame = array_ops.gather(animation, 0) + image1 = image_ops.convert_image_dtype(first_frame, dtypes.float32) + image0, image1 = self.evaluate([image0, image1]) + self.assertEqual(len(image0.shape), 3) + self.assertAllEqual(list(image0.shape), [40, 20, 3]) + self.assertAllEqual(image0, image1) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py index 31b0bef6ab4..5a3d86c1b68 100644 --- a/tensorflow/python/ops/linalg/linalg_impl.py +++ b/tensorflow/python/ops/linalg/linalg_impl.py @@ -28,22 +28,23 @@ from tensorflow.python.ops import gen_linalg_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import special_math_ops +from tensorflow.python.util import dispatch from 
tensorflow.python.util.tf_export import tf_export # Linear algebra ops. band_part = array_ops.matrix_band_part -cholesky = linalg_ops.cholesky +cholesky = dispatch.add_dispatch_support(linalg_ops.cholesky) cholesky_solve = linalg_ops.cholesky_solve -det = linalg_ops.matrix_determinant +det = dispatch.add_dispatch_support(linalg_ops.matrix_determinant) slogdet = gen_linalg_ops.log_matrix_determinant tf_export('linalg.slogdet')(slogdet) diag = array_ops.matrix_diag -diag_part = array_ops.matrix_diag_part +diag_part = dispatch.add_dispatch_support(array_ops.matrix_diag_part) eigh = linalg_ops.self_adjoint_eig eigvalsh = linalg_ops.self_adjoint_eigvals einsum = special_math_ops.einsum eye = linalg_ops.eye -inv = linalg_ops.matrix_inverse +inv = dispatch.add_dispatch_support(linalg_ops.matrix_inverse) logm = gen_linalg_ops.matrix_logarithm lu = gen_linalg_ops.lu tf_export('linalg.logm')(logm) @@ -51,16 +52,17 @@ lstsq = linalg_ops.matrix_solve_ls norm = linalg_ops.norm qr = linalg_ops.qr set_diag = array_ops.matrix_set_diag -solve = linalg_ops.matrix_solve +solve = dispatch.add_dispatch_support(linalg_ops.matrix_solve) sqrtm = linalg_ops.matrix_square_root svd = linalg_ops.svd tensordot = math_ops.tensordot -trace = math_ops.trace +trace = dispatch.add_dispatch_support(math_ops.trace) transpose = array_ops.matrix_transpose triangular_solve = linalg_ops.matrix_triangular_solve @tf_export('linalg.logdet') +@dispatch.add_dispatch_support def logdet(matrix, name=None): """Computes log of the determinant of a hermitian positive definite matrix. @@ -94,6 +96,7 @@ def logdet(matrix, name=None): @tf_export('linalg.adjoint') +@dispatch.add_dispatch_support def adjoint(matrix, name=None): """Transposes the last two dimensions of and conjugates tensor `matrix`. diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py index 80c91695360..f9128cc57a6 100644 --- a/tensorflow/python/ops/linalg/linear_operator.py +++ b/tensorflow/python/ops/linalg/linear_operator.py @@ -35,6 +35,7 @@ from tensorflow.python.ops.linalg import linalg_impl as linalg from tensorflow.python.ops.linalg import linear_operator_algebra from tensorflow.python.ops.linalg import linear_operator_util from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export __all__ = ["LinearOperator"] @@ -207,12 +208,13 @@ class LinearOperator(object): self._name = name or type(self).__name__ @contextlib.contextmanager - def _name_scope(self, name=None, values=None): + def _name_scope(self, name=None): """Helper function to standardize op scope.""" - with ops.name_scope(self.name): - with ops.name_scope( - name, values=((values or []) + self._graph_parents)) as scope: - yield scope + full_name = self.name + if name is not None: + full_name += "/" + name + with ops.name_scope(full_name) as scope: + yield scope @property def dtype(self): @@ -610,7 +612,7 @@ class LinearOperator(object): with self._name_scope(name): return linear_operator_algebra.matmul(left_operator, right_operator) - with self._name_scope(name, values=[x]): + with self._name_scope(name): x = ops.convert_to_tensor(x, name="x") self._check_input_dtype(x) @@ -654,7 +656,7 @@ class LinearOperator(object): Returns: A `Tensor` with shape `[..., M]` and same `dtype` as `self`. 
""" - with self._name_scope(name, values=[x]): + with self._name_scope(name): x = ops.convert_to_tensor(x, name="x") self._check_input_dtype(x) self_dim = -2 if adjoint else -1 @@ -796,7 +798,7 @@ class LinearOperator(object): with self._name_scope(name): return linear_operator_algebra.solve(left_operator, right_operator) - with self._name_scope(name, values=[rhs]): + with self._name_scope(name): rhs = ops.convert_to_tensor(rhs, name="rhs") self._check_input_dtype(rhs) @@ -853,7 +855,7 @@ class LinearOperator(object): Raises: NotImplementedError: If `self.is_non_singular` or `is_square` is False. """ - with self._name_scope(name, values=[rhs]): + with self._name_scope(name): rhs = ops.convert_to_tensor(rhs, name="rhs") self._check_input_dtype(rhs) self_dim = -1 if adjoint else -2 @@ -1019,10 +1021,97 @@ class LinearOperator(object): Returns: A `Tensor` with broadcast shape and same `dtype` as `self`. """ - with self._name_scope(name, values=[x]): + with self._name_scope(name): x = ops.convert_to_tensor(x, name="x") self._check_input_dtype(x) return self._add_to_tensor(x) def _can_use_cholesky(self): return self.is_self_adjoint and self.is_positive_definite + + +# Overrides for tf.linalg functions. This allows a LinearOperator to be used in +# place of a Tensor. +# For instance tf.trace(linop) and linop.trace() both work. + + +@dispatch.dispatch_for_types(linalg.adjoint, LinearOperator) +def _adjoint(matrix, name=None): + return matrix.adjoint(name) + + +@dispatch.dispatch_for_types(linalg.cholesky, LinearOperator) +def _cholesky(input, name=None): # pylint:disable=redefined-builtin + return input.cholesky(name) + + +@dispatch.dispatch_for_types(linalg.diag_part, LinearOperator) +def _diag_part(input, name=None): # pylint:disable=redefined-builtin + return input.diag_part(name) + + +@dispatch.dispatch_for_types(linalg.det, LinearOperator) +def _det(input, name=None): # pylint:disable=redefined-builtin + return input.determinant(name) + + +@dispatch.dispatch_for_types(linalg.inv, LinearOperator) +def _inverse(input, adjoint=False, name=None): # pylint:disable=redefined-builtin + inv = input.inverse(name) + if adjoint: + inv = inv.adjoint() + return inv + + +@dispatch.dispatch_for_types(linalg.logdet, LinearOperator) +def _logdet(matrix, name=None): + if matrix.is_positive_definite and matrix.is_self_adjoint: + return matrix.log_abs_determinant(name) + raise ValueError("Expected matrix to be self-adjoint positive definite.") + + +@dispatch.dispatch_for_types(math_ops.matmul, LinearOperator) +def _matmul( # pylint:disable=missing-docstring + a, + b, + transpose_a=False, + transpose_b=False, + adjoint_a=False, + adjoint_b=False, + a_is_sparse=False, + b_is_sparse=False, + name=None): + if transpose_a or transpose_b: + raise ValueError("Transposing not supported at this time.") + if a_is_sparse or b_is_sparse: + raise ValueError("Sparse methods not supported at this time.") + if not isinstance(a, LinearOperator): + # We use the identity (B^HA^H)^H = AB + adjoint_matmul = b.matmul( + a, + adjoint=(not adjoint_b), + adjoint_arg=(not adjoint_a), + name=name) + return linalg.adjoint(adjoint_matmul) + return a.matmul( + b, adjoint=adjoint_a, adjoint_arg=adjoint_b, name=name) + + +@dispatch.dispatch_for_types(linalg.solve, LinearOperator) +def _solve( + matrix, + rhs, + adjoint=False, + name=None): + if not isinstance(matrix, LinearOperator): + raise ValueError("Passing in `matrix` as a Tensor and `rhs` as a " + "LinearOperator is not supported.") + return matrix.solve(rhs, adjoint=adjoint, name=name) 
+ + +@dispatch.dispatch_for_types(linalg.trace, LinearOperator) +def _trace(x, name=None): + return x.trace(name) + + + diff --git a/tensorflow/python/ops/linalg/linear_operator_identity.py b/tensorflow/python/ops/linalg/linear_operator_identity.py index 5fc3d82ee45..1b019158023 100644 --- a/tensorflow/python/ops/linalg/linear_operator_identity.py +++ b/tensorflow/python/ops/linalg/linear_operator_identity.py @@ -389,7 +389,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity): Returns: A `Tensor` with broadcast shape and same `dtype` as `self`. """ - with self._name_scope(name, values=[mat]): + with self._name_scope(name): mat = ops.convert_to_tensor(mat, name="mat") mat_diag = array_ops.matrix_diag_part(mat) new_diag = 1 + mat_diag @@ -708,7 +708,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity): Returns: A `Tensor` with broadcast shape and same `dtype` as `self`. """ - with self._name_scope(name, values=[mat]): + with self._name_scope(name): # Shape [B1,...,Bb, 1] multiplier_vector = array_ops.expand_dims(self.multiplier, -1) diff --git a/tensorflow/python/ops/losses/BUILD b/tensorflow/python/ops/losses/BUILD index 26ab682f920..0e19d75fe90 100644 --- a/tensorflow/python/ops/losses/BUILD +++ b/tensorflow/python/ops/losses/BUILD @@ -42,6 +42,7 @@ py_test( name = "util_test", size = "small", srcs = ["util_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":losses", diff --git a/tensorflow/python/ops/losses/loss_reduction.py b/tensorflow/python/ops/losses/loss_reduction.py index 2a07c8eb57f..483a325570b 100644 --- a/tensorflow/python/ops/losses/loss_reduction.py +++ b/tensorflow/python/ops/losses/loss_reduction.py @@ -28,7 +28,10 @@ class ReductionV2(object): used with `tf.distribute.Strategy`, outside of built-in training loops such as `tf.keras` `compile` and `fit`, we expect reduction value to be `SUM` or `NONE`. Using `AUTO` in that case will raise an error. - * `NONE`: Un-reduced weighted losses with the same shape as input. + * `NONE`: Un-reduced weighted losses with the same shape as input. When this + reduction type used with built-in Keras training loops like + `fit`/`evaluate`, the unreduced vector loss is passed to the optimizer but + the reported loss will be a scalar value. * `SUM`: Scalar sum of weighted losses. * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses. 
This reduction type is not supported when used with diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py index 3e4443c95b2..d848fe49730 100644 --- a/tensorflow/python/ops/math_grad.py +++ b/tensorflow/python/ops/math_grad.py @@ -470,7 +470,7 @@ def _SqrtGradGrad(op, grad): a = op.inputs[0] y = op.outputs[0] # y = 0.5 * b / conj(a) with ops.control_dependencies([grad]): - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): ga = gen_math_ops.xdivy(grad, a) return -gen_math_ops.mul_no_nan(y, math_ops.conj(ga)), 0.5 * ga else: @@ -504,7 +504,7 @@ def _ExpGrad(op, grad): y = op.outputs[0] # y = e^x with ops.control_dependencies([grad]): y = math_ops.conj(y) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return math_ops.mul_no_nan(y, grad) else: return grad * y @@ -517,7 +517,7 @@ def _Expm1Grad(op, grad): with ops.control_dependencies([grad]): x = math_ops.conj(x) y = math_ops.exp(x) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return math_ops.mul_no_nan(y, grad) else: return grad * y @@ -529,7 +529,7 @@ def _LogGrad(op, grad): x = op.inputs[0] with ops.control_dependencies([grad]): x = math_ops.conj(x) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return gen_math_ops.xdivy(grad, x) else: return grad * math_ops.reciprocal(x) @@ -541,7 +541,7 @@ def _Log1pGrad(op, grad): x = op.inputs[0] with ops.control_dependencies([grad]): x = math_ops.conj(x) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return gen_math_ops.xdivy(grad, 1 + x) else: return grad * math_ops.reciprocal(1 + x) @@ -623,7 +623,7 @@ def _AcoshGrad(op, grad): y = op.outputs[0] with ops.control_dependencies([grad]): y = math_ops.conj(y) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return math_ops.xdivy(grad, math_ops.sinh(y)) else: return grad / math_ops.sinh(y) @@ -676,7 +676,7 @@ def _LgammaGrad(op, grad): x = op.inputs[0] with ops.control_dependencies([grad]): x = math_ops.conj(x) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return math_ops.mul_no_nan(math_ops.digamma(x), grad) else: return grad * math_ops.digamma(x) @@ -689,7 +689,7 @@ def _DigammaGrad(op, grad): with ops.control_dependencies([grad]): x = math_ops.conj(x) partial_x = math_ops.polygamma(array_ops.constant(1, dtype=x.dtype), x) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return math_ops.mul_no_nan(partial_x, grad) else: return grad * partial_x @@ -702,7 +702,7 @@ def _BesselI0eGrad(op, grad): y = op.outputs[0] with ops.control_dependencies([grad]): partial_x = (math_ops.bessel_i1e(x) - math_ops.sign(x) * y) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return math_ops.mul_no_nan(partial_x, grad) else: return grad * partial_x @@ -726,7 +726,7 @@ def _BesselI1eGrad(op, grad): dy_dx = math_ops.bessel_i0e(safe_x) - y * ( math_ops.sign(safe_x) + math_ops.reciprocal(safe_x)) dy_dx = array_ops.where(x_is_not_tiny, dy_dx, 0.5 + zeros) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return math_ops.mul_no_nan(dy_dx, grad) else: return grad * dy_dx @@ -747,7 +747,7 @@ def _IgammaGrad(op, grad): # and Gamma'(a) can grow large. 
partial_x = math_ops.exp(-x + (a - 1) * math_ops.log(x) - math_ops.lgamma(a)) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return (array_ops.reshape( math_ops.reduce_sum(math_ops.mul_no_nan(partial_a, grad), ra), sa), array_ops.reshape( @@ -786,7 +786,7 @@ def _BetaincGrad(op, grad): (a - 1) * math_ops.log(x) - log_beta) # TODO(b/36815900): Mark None return values as NotImplemented - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return ( None, # da None, # db @@ -815,7 +815,7 @@ def _ZetaGrad(op, grad): q = math_ops.conj(q) partial_q = -x * math_ops.zeta(x + 1, q) # TODO(b/36815900): Mark None return values as NotImplemented - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return (None, array_ops.reshape( math_ops.reduce_sum(math_ops.mul_no_nan(partial_q, grad), rq), @@ -841,7 +841,7 @@ def _PolygammaGrad(op, grad): x = math_ops.conj(x) partial_x = math_ops.polygamma(n + 1, x) # TODO(b/36815900): Mark None return values as NotImplemented - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return (None, array_ops.reshape( math_ops.reduce_sum(math_ops.mul_no_nan(partial_x, grad), rx), @@ -902,7 +902,7 @@ def _TanGrad(op, grad): x = math_ops.conj(x) secx = math_ops.reciprocal(math_ops.cos(x)) secx2 = math_ops.square(secx) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return math_ops.mul_no_nan(secx2, grad) else: return secx2 * grad @@ -917,7 +917,7 @@ def _AsinGrad(op, grad): x2 = math_ops.square(x) one = constant_op.constant(1, dtype=grad.dtype) den = math_ops.sqrt(math_ops.subtract(one, x2)) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return math_ops.xdivy(grad, den) else: inv = math_ops.reciprocal(den) @@ -933,7 +933,7 @@ def _AcosGrad(op, grad): x2 = math_ops.square(x) one = constant_op.constant(1, dtype=grad.dtype) den = math_ops.sqrt(math_ops.subtract(one, x2)) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return -math_ops.xdivy(grad, den) else: inv = math_ops.reciprocal(den) @@ -958,7 +958,7 @@ def _Atan2Grad(op, grad): y = op.inputs[0] x = op.inputs[1] with ops.control_dependencies([grad]): - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): grad_inv = math_ops.xdivy(grad, (math_ops.square(x) + math_ops.square(y))) else: grad_inv = grad / (math_ops.square(x) + math_ops.square(y)) @@ -1078,7 +1078,7 @@ def _DivGrad(op, grad): rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy) x = math_ops.conj(x) y = math_ops.conj(y) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return (array_ops.reshape( math_ops.reduce_sum(math_ops.xdivy(grad, y), rx), sx), array_ops.reshape( @@ -1131,7 +1131,7 @@ def _RealDivGrad(op, grad): rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy) x = math_ops.conj(x) y = math_ops.conj(y) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return (array_ops.reshape( math_ops.reduce_sum(math_ops.xdivy(grad, y), rx), sx), array_ops.reshape( @@ -1158,7 +1158,7 @@ def _DivNoNanGrad(op, grad): rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy) x = math_ops.conj(x) y = math_ops.conj(y) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): return (array_ops.reshape( math_ops.reduce_sum(math_ops.div_no_nan(grad, y), rx), 
sx), array_ops.reshape( @@ -1188,7 +1188,7 @@ def _PowGrad(op, grad): y = math_ops.conj(y) z = math_ops.conj(z) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): gx = array_ops.reshape( math_ops.reduce_sum( gen_math_ops.mul_no_nan(y * math_ops.pow(x, y - 1), grad), rx), sx) @@ -1204,7 +1204,7 @@ def _PowGrad(op, grad): mask = x > 0 safe_x = array_ops.where(mask, x, array_ops.ones_like(x)) log_x = array_ops.where(mask, math_ops.log(safe_x), array_ops.zeros_like(x)) - if compat.forward_compatible(2019, 5, 14): + if compat.forward_compatible(2019, 6, 14): gy = array_ops.reshape( math_ops.reduce_sum(gen_math_ops.mul_no_nan(z * log_x, grad), ry), sy) else: diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index b83c5ff7afa..224b640b97f 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -2456,6 +2456,7 @@ def trace(x, name=None): @tf_export("linalg.matmul", "matmul") +@dispatch.add_dispatch_support def matmul(a, b, transpose_a=False, diff --git a/tensorflow/python/ops/parallel_for/BUILD b/tensorflow/python/ops/parallel_for/BUILD index 001ae33a5e1..82ab32ac0d4 100644 --- a/tensorflow/python/ops/parallel_for/BUILD +++ b/tensorflow/python/ops/parallel_for/BUILD @@ -81,8 +81,12 @@ py_library( "//tensorflow/python:control_flow_ops", "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", "//tensorflow/python:tensor_array_ops", + "//tensorflow/python:tensor_util", "//tensorflow/python:util", + "//tensorflow/python/eager:context", + "//tensorflow/python/eager:function", ], ) diff --git a/tensorflow/python/ops/parallel_for/array_test.py b/tensorflow/python/ops/parallel_for/array_test.py index e1b1e0917df..9568a07fd20 100644 --- a/tensorflow/python/ops/parallel_for/array_test.py +++ b/tensorflow/python/ops/parallel_for/array_test.py @@ -23,6 +23,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import random_ops from tensorflow.python.ops import tensor_array_grad # pylint: disable=unused-import @@ -122,6 +123,31 @@ class ArrayTest(PForTestCase): self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2) + def test_one_hot(self): + indices = random_ops.random_uniform( + [3, 2, 3], minval=0, maxval=4, dtype=dtypes.int32) + + def loop_fn(i): + indices_i = array_ops.gather(indices, i) + return (array_ops.one_hot(indices_i, depth=4, on_value=2., off_value=-2.), + array_ops.one_hot(indices_i, depth=4, axis=1)) + + self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2) + + def test_searchsorted(self): + sorted_inputs = math_ops.cumsum(random_ops.random_uniform([3, 2, 4]), + axis=-1) + values = random_ops.random_uniform([2, 3], minval=-1, maxval=4.5) + + def loop_fn(i): + inputs_i = array_ops.gather(sorted_inputs, i) + return [array_ops.searchsorted(inputs_i, values, out_type=dtypes.int32, + side="left"), # creates LowerBound op. + array_ops.searchsorted(inputs_i, values, out_type=dtypes.int64, + side="right")] # creates UpperBound op. 
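These array tests drive the `OneHot` and `LowerBound`/`UpperBound` converters registered in `pfor.py` later in this change; combined with the new `tf.vectorized_map` entry point, per-example calls can be batched without writing a loop. A hedged sketch (shapes are illustrative):

```python
import tensorflow as tf

indices = tf.random.uniform([8, 5], minval=0, maxval=4, dtype=tf.int32)

def per_example(idx):
  # Ops like one_hot (and searchsorted) now have pfor converters, so they can
  # appear inside the mapped function.
  return tf.one_hot(idx, depth=4)

result = tf.vectorized_map(per_example, indices)  # shape [8, 5, 4]
```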
+ + self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64]) + def test_slice(self): x = random_ops.random_uniform([3, 2, 3]) diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops.py b/tensorflow/python/ops/parallel_for/control_flow_ops.py index 5258d6a721a..89df51a42e4 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops.py @@ -35,6 +35,7 @@ from tensorflow.python.ops.parallel_for.pfor import PForConfig from tensorflow.python.util import nest from tensorflow.python.util import tf_decorator from tensorflow.python.util import tf_inspect +from tensorflow.python.util.tf_export import tf_export def for_loop(loop_fn, loop_fn_dtypes, iters, parallel_iterations=None): @@ -259,3 +260,79 @@ def _pfor_impl(loop_fn, iters, parallel_iterations=None, pfor_config=None): else: outputs = tiled_outputs return nest.pack_sequence_as(loop_fn_outputs, nest.flatten(outputs)) + + +@tf_export("vectorized_map") +def vectorized_map(fn, elems): + """Parallel map on the list of tensors unpacked from `elems` on dimension 0. + + + This method works similar to tf.map_fn but is optimized to run much faster, + but possibly with a much larger memory footprint. The speedups are obtained by + vectorization (see https://arxiv.org/pdf/1903.04243.pdf). The idea behind + vectorization is to semantically launch all the invocations of `fn` in + parallel and fuse corresponding operations across all these invocations. This + fusion is done statically at graph generation time and the generated code is + often similar in performance to a manually fused version. + + + For example, let's look at a method that calculates the outer product of a + matrix. + + ```python + def outer_product(a): + return tf.tensordot(a, a, 0) + + # outer_product was designed to not support batching. + c = outer_product(tf.ones((2, 3))) + # The shape is consistent + assert c.shape == (2, 3, 2, 3) + ``` + + Now suppose we want an efficient batched version of outer_product. We can + simply write: + + ```python + batch_size = 100 + a = tf.ones((batch_size, 32, 32)) + c = tf.vectorized_map(outer_product, a) + assert c.shape == (batch_size, 32, 32, 32, 32) + ``` + + Because `tf.vectorized_map` fully parallelizes the batch, this method will + generally be significantly faster than using `tf.map_fn`, especially in eager + mode. + + This is an experimental feature and currently has a lot of limitations: + - There should be no data dependency between the different semantic + invocations of `fn`, i.e. it should be safe to map the elements of the + inputs in any order. + - Stateful kernels may mostly not be supported since these often imply a + data dependency. We do support a limited set of such stateful kernels + though (like RandomFoo, Variable operations like reads, etc). + - `fn` has limited support for control flow operations. `tf.cond` in + particular is not supported. + - `fn` should return nested structure of Tensors or Operations. However + if an Operation is returned, it should have zero outputs. + - The shape and dtype of `fn` outputs should not depend on the input + to `fn`. + + Args: + fn: The callable to be performed. It accepts one argument, which will have + the same (possibly nested) structure as `elems`, and returns a possibly + nested structure of Tensors and Operations, which may be different than + the structure of `elems`. + elems: A tensor or (possibly nested) sequence of tensors, each of which will + be unpacked along their first dimension. 
The nested sequence of the + resulting slices will be mapped over by `fn`. + + Returns: + A tensor or (possibly nested) sequence of tensors. Each tensor packs the + results of applying fn to tensors unpacked from elems along the first + dimension, from first to last. + """ + def loop_fn(i): + gathered_elems = nest.map_structure(lambda x: array_ops.gather(x, i), elems) + return fn(gathered_elems) + batch_size = array_ops.shape(nest.flatten(elems)[0])[0] + return pfor(loop_fn, batch_size) diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index 901ff243d60..ab98ef0e44b 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -101,6 +101,12 @@ class PForTest(PForTestCase): with self.assertRaisesRegexp(ValueError, "Use for_loop instead"): pfor_control_flow_ops.pfor(lambda i: 1, 8, parallel_iterations=1) + def test_vectorized_map(self): + def compute(x): + return math_ops.reduce_mean(x, axis=0, keepdims=True) + result = pfor_control_flow_ops.vectorized_map( + compute, array_ops.ones((10, 5, 3))) + self.run_and_assert_equal(result, array_ops.ones((10, 1, 3))) @test_util.run_all_in_graph_and_eager_modes class ReductionTest(PForTestCase): diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py index 4de392b55f0..1799e7f51a0 100644 --- a/tensorflow/python/ops/parallel_for/pfor.py +++ b/tensorflow/python/ops/parallel_for/pfor.py @@ -34,6 +34,7 @@ from tensorflow.python.ops import bitwise_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import data_flow_ops +from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import gen_parsing_ops from tensorflow.python.ops import gen_random_ops @@ -1710,6 +1711,18 @@ def _convert_expanddims(pfor_input): return wrap(array_ops.expand_dims(t, axis=dim), True) +@RegisterPForWithArgs("LowerBound", gen_array_ops.lower_bound) +@RegisterPForWithArgs("UpperBound", gen_array_ops.upper_bound) +def _convert_searchsorted(pfor_input, _, op_func): + pfor_input.stack_inputs() + sorted_inputs = _flatten_first_two_dims(pfor_input.stacked_input(0)) + values = _flatten_first_two_dims(pfor_input.stacked_input(1)) + out_type = pfor_input.get_attr("out_type") + output = op_func(sorted_inputs, values, out_type) + return wrap(_unflatten_first_dim( + output, pfor_input.pfor.loop_len_vector), True) + + @RegisterPFor("MatrixBandPart") def _convert_matrix_band_part(pfor_input): t = pfor_input.stacked_input(0) @@ -1727,6 +1740,19 @@ def _convert_matrix_set_diag(pfor_input): return wrap(array_ops.matrix_set_diag(t, diag), True) +@RegisterPFor("OneHot") +def _convert_one_hot(pfor_input): + indices = pfor_input.stacked_input(0) + depth = pfor_input.unstacked_input(1) + on_value = pfor_input.unstacked_input(2) + off_value = pfor_input.unstacked_input(3) + axis = pfor_input.get_attr("axis") + if axis >= 0: + axis += 1 + return wrap( + array_ops.one_hot(indices, depth, on_value, off_value, axis), True) + + @RegisterPFor("Slice") def _convert_slice(pfor_input): t = pfor_input.stacked_input(0) diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py index fdbc5a6c69a..5f753041f41 100644 --- a/tensorflow/python/ops/resource_variable_ops.py +++ b/tensorflow/python/ops/resource_variable_ops.py @@ 
-173,10 +173,11 @@ def variable_handle_from_shape_and_dtype( return handle -def eager_safe_variable_handle(initial_value, shared_name, name, graph_mode): +def eager_safe_variable_handle(initial_value, shape, shared_name, name, + graph_mode): """Creates a variable handle with information to do shape inference. - The shape and dtype are read from `initial_value` and stored in the returned + The dtype is read from `initial_value` and stored in the returned resource tensor's handle data. If `initial_value.dtype == tf.variant`, we additionally extract the handle @@ -206,6 +207,8 @@ def eager_safe_variable_handle(initial_value, shared_name, name, graph_mode): Args: initial_value: A `Tensor`. + shape: The shape of the handle data. Can be `TensorShape(None)` + (i.e. unknown shape). shared_name: A string. name: A string. graph_mode: A python bool. @@ -213,7 +216,6 @@ def eager_safe_variable_handle(initial_value, shared_name, name, graph_mode): Returns: The handle, a `Tensor` of type `resource`. """ - shape = initial_value.get_shape() dtype = initial_value.dtype.base_dtype return variable_handle_from_shape_and_dtype( shape, dtype, shared_name, name, graph_mode, initial_value) @@ -369,7 +371,8 @@ class ResourceVariable(variables.VariableV1): constraint=None, distribute_strategy=None, synchronization=None, - aggregation=None): + aggregation=None, + shape=None): """Creates a variable. Args: @@ -418,6 +421,10 @@ class ResourceVariable(variables.VariableV1): aggregation: Indicates how a distributed variable will be aggregated. Accepted values are constants defined in the class `tf.VariableAggregation`. + shape: (optional) The shape of this variable. If None, the shape of + `initial_value` will be used. When setting this argument to + `tf.TensorShape(None)` (representing an unspecified shape), the variable + can be assigned with values of different shapes. Raises: ValueError: If the initial value is not specified, or does not have a @@ -448,7 +455,8 @@ class ResourceVariable(variables.VariableV1): dtype=dtype, constraint=constraint, synchronization=synchronization, - aggregation=aggregation) + aggregation=aggregation, + shape=shape) def __repr__(self): if context.executing_eagerly() and not self._in_graph_mode: @@ -468,7 +476,8 @@ class ResourceVariable(variables.VariableV1): dtype=None, constraint=None, synchronization=None, - aggregation=None): + aggregation=None, + shape=None): """Creates a variable. Args: @@ -510,6 +519,10 @@ class ResourceVariable(variables.VariableV1): aggregation: Indicates how a distributed variable will be aggregated. Accepted values are constants defined in the class `tf.VariableAggregation`. + shape: (optional) The shape of this variable. If None, the shape of + `initial_value` will be used. When setting this argument to + `tf.TensorShape(None)` (representing an unspecified shape), the variable + can be assigned with values of different shapes. Raises: ValueError: If the initial value is not specified, or does not have a @@ -588,12 +601,15 @@ class ResourceVariable(variables.VariableV1): initial_value = ops.convert_to_tensor( initial_value() if init_from_fn else initial_value, name="initial_value", dtype=dtype) + # Don't use `shape or initial_value.shape` since TensorShape has + # overridden `__bool__`. 
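The new `shape` argument documented above is what enables shape-polymorphic variables; a minimal usage sketch (assuming the public `tf.Variable` endpoint and eager execution, values illustrative) might be:

```python
import tensorflow as tf

# An unspecified shape decouples the variable from its initial value's shape,
# so later assignments may use values with a different shape.
v = tf.Variable([1., 2., 3.], shape=tf.TensorShape(None))
v.assign([1., 2., 3., 4.])  # OK because the declared shape is unknown.
```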
+ self._shape = shape if shape is not None else initial_value.shape self._handle = eager_safe_variable_handle( initial_value=initial_value, + shape=self._shape, shared_name=shared_name, name=name, graph_mode=self._in_graph_mode) - self._shape = initial_value.shape # pylint: disable=protected-access if (self._in_graph_mode and initial_value is not None and initial_value.op._get_control_flow_context() is not None): @@ -1710,24 +1726,29 @@ class UninitializedVariable(ResourceVariable): # Store the graph key so optimizers know how to only retrieve variables from # this graph. Guaranteed to be the same as the eager graph_key. self._graph_key = ops.get_default_graph()._graph_key # pylint: disable=protected-access - self._shape = shape - self._dtype = dtype + self._shape = tensor_shape.as_shape(shape) + self._dtype = dtypes.as_dtype(dtype) with ops.init_scope(): - handle_name = ops.name_from_scope_name(name) - unique_id = "%s_%d" % (handle_name, ops.uid()) - shared_name = context.shared_name(unique_id) - self._handle = variable_handle_from_shape_and_dtype( - shape=shape, dtype=dtype, shared_name=shared_name, - name=name, graph_mode=self._in_graph_mode, - extra_handle_data=extra_handle_data) - if self._in_graph_mode: - with ops.name_scope("Read"), ops.colocate_with(self._handle): - # Manually assign reads to the handle's device to avoid log - # messages. - with ops.device(self._handle.device): - value = self._read_variable_op() - self._graph_element = value - ops.add_to_collection(ops.GraphKeys.GLOBAL_VARIABLES, self) + with ops.name_scope(name, "Variable") as name: + handle_name = ops.name_from_scope_name(name) + if self._in_graph_mode: + shared_name = handle_name + unique_id = shared_name + else: + unique_id = "%s_%d" % (handle_name, ops.uid()) + shared_name = context.shared_name(unique_id) + self._handle = variable_handle_from_shape_and_dtype( + shape=shape, dtype=dtype, shared_name=shared_name, + name=name, graph_mode=self._in_graph_mode, + extra_handle_data=extra_handle_data) + if self._in_graph_mode: + with ops.name_scope("Read"), ops.colocate_with(self._handle): + # Manually assign reads to the handle's device to avoid log + # messages. 
+ with ops.device(self._handle.device): + value = self._read_variable_op() + self._graph_element = value + ops.add_to_collection(ops.GraphKeys.GLOBAL_VARIABLES, self) self._unique_id = unique_id self._handle_name = handle_name + ":0" self._constraint = constraint diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py index ce75de72a5f..f007e1f76c5 100644 --- a/tensorflow/python/ops/standard_ops.py +++ b/tensorflow/python/ops/standard_ops.py @@ -105,6 +105,8 @@ from tensorflow.python.ops.template import * from tensorflow.python.ops.tensor_array_ops import * from tensorflow.python.ops.variable_scope import * from tensorflow.python.ops.variables import * +from tensorflow.python.ops.parallel_for.control_flow_ops import vectorized_map + # pylint: enable=wildcard-import # pylint: enable=g-bad-import-order diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index aeb7b547be0..7440c3fb0f0 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -2469,6 +2469,7 @@ def default_variable_creator(next_creator=None, **kwargs): use_resource = kwargs.get("use_resource", None) synchronization = kwargs.get("synchronization", None) aggregation = kwargs.get("aggregation", None) + shape = kwargs.get("shape", None) if use_resource is None: use_resource = get_variable_scope().use_resource @@ -2490,7 +2491,8 @@ def default_variable_creator(next_creator=None, **kwargs): import_scope=import_scope, distribute_strategy=distribute_strategy, synchronization=synchronization, - aggregation=aggregation) + aggregation=aggregation, + shape=shape) else: return variables.RefVariable( initial_value=initial_value, @@ -2505,7 +2507,8 @@ def default_variable_creator(next_creator=None, **kwargs): expected_shape=expected_shape, import_scope=import_scope, synchronization=synchronization, - aggregation=aggregation) + aggregation=aggregation, + shape=shape) def default_variable_creator_v2(next_creator=None, **kwargs): @@ -2523,6 +2526,7 @@ def default_variable_creator_v2(next_creator=None, **kwargs): distribute_strategy = kwargs.get("distribute_strategy", None) synchronization = kwargs.get("synchronization", None) aggregation = kwargs.get("aggregation", None) + shape = kwargs.get("shape", None) return resource_variable_ops.ResourceVariable( initial_value=initial_value, @@ -2536,7 +2540,8 @@ def default_variable_creator_v2(next_creator=None, **kwargs): import_scope=import_scope, distribute_strategy=distribute_strategy, synchronization=synchronization, - aggregation=aggregation) + aggregation=aggregation, + shape=shape) variables.default_variable_creator = default_variable_creator diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py index 6970bfc5b18..627ef54aa09 100644 --- a/tensorflow/python/ops/variables.py +++ b/tensorflow/python/ops/variables.py @@ -192,7 +192,8 @@ class VariableMetaclass(type): constraint=None, use_resource=None, synchronization=VariableSynchronization.AUTO, - aggregation=VariableAggregation.NONE): + aggregation=VariableAggregation.NONE, + shape=None): """Call on Variable class. 
Useful to force the signature.""" previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs) for _, getter in ops.get_default_graph()._variable_creator_stack: # pylint: disable=protected-access @@ -215,7 +216,8 @@ class VariableMetaclass(type): constraint=constraint, use_resource=use_resource, synchronization=synchronization, - aggregation=aggregation) + aggregation=aggregation, + shape=shape) def _variable_v2_call(cls, initial_value=None, @@ -228,7 +230,8 @@ class VariableMetaclass(type): import_scope=None, constraint=None, synchronization=VariableSynchronization.AUTO, - aggregation=VariableAggregation.NONE): + aggregation=VariableAggregation.NONE, + shape=None): """Call on Variable class. Useful to force the signature.""" previous_getter = lambda **kws: default_variable_creator_v2(None, **kws) for _, getter in ops.get_default_graph()._variable_creator_stack: # pylint: disable=protected-access @@ -248,7 +251,8 @@ class VariableMetaclass(type): import_scope=import_scope, constraint=constraint, synchronization=synchronization, - aggregation=aggregation) + aggregation=aggregation, + shape=shape) def __call__(cls, *args, **kwargs): if cls is VariableV1: @@ -388,7 +392,8 @@ class Variable(six.with_metaclass(VariableMetaclass, import_scope=None, constraint=None, synchronization=VariableSynchronization.AUTO, - aggregation=VariableAggregation.NONE): + aggregation=VariableAggregation.NONE, + shape=None): """Creates a new variable with value `initial_value`. The new variable is added to the graph collections listed in `collections`, @@ -444,6 +449,10 @@ class Variable(six.with_metaclass(VariableMetaclass, aggregation: Indicates how a distributed variable will be aggregated. Accepted values are constants defined in the class `tf.VariableAggregation`. + shape: (optional) The shape of this variable. If None, the shape of + `initial_value` will be used. When setting this argument to + `tf.TensorShape(None)` (representing an unspecified shape), the variable + can be assigned with values of different shapes. Raises: ValueError: If both `variable_def` and initial_value are specified. @@ -1364,7 +1373,8 @@ class VariableV1(Variable): constraint=None, use_resource=None, synchronization=VariableSynchronization.AUTO, - aggregation=VariableAggregation.NONE): + aggregation=VariableAggregation.NONE, + shape=None): """Creates a new variable with value `initial_value`. The new variable is added to the graph collections listed in `collections`, @@ -1419,6 +1429,10 @@ class VariableV1(Variable): use_resource: whether to use resource variables. synchronization: unused aggregation: unused + shape: (optional) The shape of this variable. If None, the shape of + `initial_value` will be used. When setting this argument to + `tf.TensorShape(None)` (representing an unspecified shape), the variable + can be assigned with values of different shapes. Raises: ValueError: If both `variable_def` and initial_value are specified. @@ -1447,7 +1461,8 @@ class RefVariable(VariableV1): import_scope=None, constraint=None, synchronization=None, - aggregation=None): + aggregation=None, + shape=None): """Creates a new variable with value `initial_value`. The new variable is added to the graph collections listed in `collections`, @@ -1508,6 +1523,10 @@ class RefVariable(VariableV1): aggregation: Indicates how a distributed variable will be aggregated. Accepted values are constants defined in the class `tf.VariableAggregation`. + shape: (optional) The shape of this variable. 
If None, the shape of + `initial_value` will be used. When setting this argument to + `tf.TensorShape(None)` (representing an unspecified shape), the variable + can be assigned with values of different shapes. Raises: ValueError: If both `variable_def` and initial_value are specified. @@ -1535,7 +1554,8 @@ class RefVariable(VariableV1): expected_shape=expected_shape, constraint=constraint, synchronization=synchronization, - aggregation=aggregation) + aggregation=aggregation, + shape=shape) def __repr__(self): if context.executing_eagerly() and not self._in_graph_mode: @@ -1557,7 +1577,8 @@ class RefVariable(VariableV1): expected_shape=None, constraint=None, synchronization=None, - aggregation=None): + aggregation=None, + shape=None): """Creates a new variable from arguments. Args: @@ -1603,6 +1624,10 @@ class RefVariable(VariableV1): aggregation: Indicates how a distributed variable will be aggregated. Accepted values are constants defined in the class `tf.VariableAggregation`. + shape: (optional) The shape of this variable. If None, the shape of + `initial_value` will be used. When setting this argument to + `tf.TensorShape(None)` (representing an unspecified shape), the variable + can be assigned with values of different shapes. Raises: ValueError: If the initial value is not specified, or does not have a @@ -1660,8 +1685,9 @@ class RefVariable(VariableV1): with ops.name_scope("Initializer"), ops.device(None): self._initial_value = ops.convert_to_tensor( initial_value(), name="initial_value", dtype=dtype) - shape = (self._initial_value.get_shape() - if validate_shape else tensor_shape.unknown_shape()) + if shape is None: + shape = (self._initial_value.get_shape() + if validate_shape else tensor_shape.unknown_shape()) self._variable = state_ops.variable_op_v2( shape, self._initial_value.dtype.base_dtype, @@ -1679,9 +1705,10 @@ class RefVariable(VariableV1): "construct, such as a loop or conditional. When creating a " "variable inside a loop or conditional, use a lambda as the " "initializer." % name) - # pylint: enable=protected-access - shape = (self._initial_value.get_shape() - if validate_shape else tensor_shape.unknown_shape()) + if shape is None: + # pylint: enable=protected-access + shape = (self._initial_value.get_shape() + if validate_shape else tensor_shape.unknown_shape()) # In this case, the variable op can't be created until after the # initial_value has been converted to a Tensor with a known type. 
self._variable = state_ops.variable_op_v2( diff --git a/tensorflow/python/platform/benchmark_test.py b/tensorflow/python/platform/benchmark_test.py index 64b6163ce0a..17605984e70 100644 --- a/tensorflow/python/platform/benchmark_test.py +++ b/tensorflow/python/platform/benchmark_test.py @@ -24,7 +24,6 @@ from tensorflow.core.util import test_log_pb2 from tensorflow.python.platform import benchmark from tensorflow.python.platform import test - class BenchmarkTest(test.TestCase, benchmark.TensorFlowBenchmark): def testReportBenchmark(self): @@ -39,14 +38,17 @@ class BenchmarkTest(test.TestCase, benchmark.TensorFlowBenchmark): iters=2000, wall_time=1000, name='testReportBenchmark', - metrics=[{'name': 'metric_name', 'value': 99, 'min_value': 1}]) + metrics=[{'name': 'metric_name_1', 'value': 0, 'min_value': 1}, + {'name': 'metric_name_2', 'value': 90, 'min_value': 0, + 'max_value': 95}]) with open(proto_file_path, 'rb') as f: benchmark_entries = test_log_pb2.BenchmarkEntries() benchmark_entries.ParseFromString(f.read()) actual_result = json_format.MessageToDict( - benchmark_entries, preserving_proto_field_name=True)['entry'][0] + benchmark_entries, preserving_proto_field_name=True, + including_default_value_fields=True)['entry'][0] os.remove(proto_file_path) expected_result = { @@ -55,11 +57,22 @@ class BenchmarkTest(test.TestCase, benchmark.TensorFlowBenchmark): # int64 field to string. 'iters': '2000', 'wall_time': 1000, - 'metrics': [{ - 'name': 'metric_name', - 'value': 99, - 'min_value': 1 - }] + 'cpu_time': 0, + 'throughput': 0, + 'extras': {}, + 'metrics': [ + { + 'name': 'metric_name_1', + 'value': 0, + 'min_value': 1 + }, + { + 'name': 'metric_name_2', + 'value': 90, + 'min_value': 0, + 'max_value': 95 + } + ] } self.assertEqual(2000, benchmark_entries.entry[0].iters) diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD index fcab57c12c9..f2796e43989 100644 --- a/tensorflow/python/profiler/BUILD +++ b/tensorflow/python/profiler/BUILD @@ -147,6 +147,7 @@ py_test( size = "small", srcs = ["pprof_profiler_test.py"], main = "pprof_profiler_test.py", + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], # TODO(annarev): get it working with pip. deps = [ diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py index 17c1024fe34..a41fac4fcf4 100644 --- a/tensorflow/python/saved_model/load.py +++ b/tensorflow/python/saved_model/load.py @@ -26,7 +26,6 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util -from tensorflow.python.ops import init_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables from tensorflow.python.saved_model import function_deserialization @@ -111,6 +110,13 @@ class _Loader(object): # itself. concrete_function._captured_inputs = bound_inputs # pylint: disable=protected-access concrete_function._func_graph.variables = bound_variables # pylint: disable=protected-access + if bound_inputs: + for bound_input, internal_capture in zip( + bound_inputs, concrete_function.inputs[-len(bound_inputs):]): + concrete_function.graph.captures[bound_input] = internal_capture + # Setting "captures" first means "capture" won't create a new + # placeholder for this input. 
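The user-visible effect of re-registering bound inputs as captures is exercised by the `test_concrete_function_captures` test further down; a rough sketch of the same check outside the test harness (assuming the 2.x-style `tf.saved_model.save`/`load` endpoints; the path is illustrative) might be:

```python
import tensorflow as tf

class Root(tf.Module):

  def __init__(self):
    self.v = tf.Variable(1.)

  @tf.function(input_signature=[tf.TensorSpec(None, tf.float32)])
  def use_v(self, x):
    return self.v + x

root = Root()
tf.saved_model.save(root, "/tmp/root")  # illustrative path
restored = tf.saved_model.load("/tmp/root")
# After loading, the restored variable's handle should show up among the
# concrete function's graph captures again.
assert restored.v.handle in restored.use_v.get_concrete_function().graph.captures
```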
+ concrete_function.graph.capture(bound_input) def _get_tensor_from_node(self, node_id): """Resolves a node id into a tensor to be captured for a function.""" @@ -262,8 +268,6 @@ class _Loader(object): proto, self._concrete_functions), setattr def _recreate_variable(self, proto): - # TODO(andresp): Can we use the checkpointed value as initializer? - dummy_value = init_ops.Zeros(dtype=proto.dtype)(shape=proto.shape) name = proto.name if proto.name else None if name is not None: dbg_name = name @@ -273,8 +277,9 @@ class _Loader(object): variables.validate_synchronization_aggregation_trainable( proto.synchronization, proto.aggregation, proto.trainable, name=dbg_name)) - return variables.Variable( - dummy_value, + return resource_variable_ops.UninitializedVariable( + shape=proto.shape, + dtype=proto.dtype, name=name, trainable=trainable, synchronization=synchronization, @@ -344,6 +349,27 @@ def load(export_dir, tags=None): assert 6. == imported.f(x=tf.constant(2.)).numpy() ``` + _Importing SavedModels from TensorFlow 1.x_ + + SavedModels from `tf.estimator.Estimator` or 1.x SavedModel APIs have a flat + graph instead of `tf.function` objects. These SavedModels will have functions + corresponding to their signatures in the `.signatures` attribute, but also + have a `.prune` method which allows you to extract functions for new + subgraphs. This is equivalent to importing the SavedModel and naming feeds and + fetches in a Session from TensorFlow 1.x. + + ```python + imported = tf.saved_model.load(path_to_v1_saved_model) + pruned = imported.prune("x:0", "out:0") + pruned(tf.ones([])) + ``` + + See `tf.compat.v1.wrap_function` for details. These SavedModels also have a + `.variables` attribute containing imported variables, and a `.graph` attribute + representing the whole imported graph. For SavedModels exported from + `tf.saved_model.save`, variables are instead assigned to whichever attributes + they were assigned before export. + Args: export_dir: The SavedModel directory to load from. tags: A tag or sequence of tags identifying the MetaGraph to load. 
Optional @@ -380,6 +406,9 @@ def load(export_dir, tags=None): saved_model_proto, export_dir) root = loader.get(0) + root.tensorflow_version = meta_graph_def.meta_info_def.tensorflow_version + root.tensorflow_git_version = ( + meta_graph_def.meta_info_def.tensorflow_git_version) else: with ops.init_scope(): root = load_v1_in_v2.load(export_dir, tags) diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py index 098e2d330fd..953efc4fa2e 100644 --- a/tensorflow/python/saved_model/load_test.py +++ b/tensorflow/python/saved_model/load_test.py @@ -31,12 +31,15 @@ from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import backprop from tensorflow.python.eager import def_function from tensorflow.python.eager import test +from tensorflow.python.eager import wrap_function from tensorflow.python.feature_column import feature_column_v2 from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util +from tensorflow.python.framework import versions from tensorflow.python.keras.engine import base_layer from tensorflow.python.keras.engine import input_layer from tensorflow.python.keras.engine import sequential @@ -47,6 +50,7 @@ from tensorflow.python.keras.optimizer_v2 import adam from tensorflow.python.lib.io import file_io from tensorflow.python.module import module from tensorflow.python.ops import array_ops +from tensorflow.python.ops import cond_v2 from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope @@ -117,6 +121,25 @@ class LoadTest(test.TestCase, parameterized.TestCase): self.assertTrue(imported.v1.name.startswith("foo/")) self.assertTrue(imported.v2.name.startswith("foo/")) + def test_partially_defined_variable_shape(self, cycles): + + class MakeVariable(module.Module): + + def __init__(self): + self.v = None + + @def_function.function( + input_signature=[tensor_spec.TensorSpec([None], dtypes.int64)]) + def make_variable(self, initial_value): + if self.v is None: + self.v = variables.Variable(initial_value) + + m = MakeVariable() + m.make_variable([1, 2, 3]) + m = self.cycle(m, cycles) + m.v.assign([1, 2, 3, 4]) + self.assertEqual([None], tensor_shape.as_shape(m.v.shape).as_list()) + @test_util.run_in_graph_and_eager_modes def test_capture_variables(self, cycles): root = tracking.AutoTrackable() @@ -191,6 +214,36 @@ class LoadTest(test.TestCase, parameterized.TestCase): with open(self.evaluate(imported.asset2.asset_path), "r") as f: self.assertEqual("contents 2", f.read()) + def test_cond_prune(self, cycles): + x_in = [] + x_out = [] + + def f(x, y): + x_in.append(x) + xx = cond_v2.cond_v2( + math_ops.less(1, 2), + lambda: x + 1, + lambda: x + 2, + ) + x_out.append(xx) + return xx, 2 * y + + f_wrapped = wrap_function.wrap_function( + f, [tensor_spec.TensorSpec((), dtypes.float32)] * 2) + f_pruned = f_wrapped.prune(x_in[0], [x_out[0]]) + + class Adder(module.Module): + + @def_function.function(input_signature=[ + tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32)]) + def add(self, x): + return f_pruned(x) + + root = Adder() + root.add(constant_op.constant(1.)) + root = self.cycle(root, cycles) + root.add(constant_op.constant(1.)) + def test_capture_assets(self, cycles): root = tracking.AutoTrackable() 
root.vocab = tracking.TrackableAsset(self._make_asset("contents")) @@ -794,6 +847,33 @@ class LoadTest(test.TestCase, parameterized.TestCase): self.assertAllEqual([2, 4, 6], imported.f(constant_op.constant([1, 2, 3])).numpy()) + def test_concrete_function_captures(self, cycles): + + class Root(module.Module): + + def __init__(self): + self.v = variables.Variable(1.) + self.v1 = variables.Variable(1.) + + @def_function.function( + input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)]) + def use_v(self, x): + return self.v + self.v1 + 1. + + root = Root() + self.assertIn(root.v.handle, + root.use_v.get_concrete_function().graph.captures) + for _ in range(cycles): + root = self.cycle(root, 1, signatures=root.use_v.get_concrete_function()) + func_captures = root.use_v.get_concrete_function().graph.captures + self.assertLen(func_captures, 2) + self.assertIn(root.v.handle, func_captures) + self.assertIn(root.v1.handle, func_captures) + signature_captures = root.signatures["serving_default"].graph.captures + self.assertLen(signature_captures, 2) + self.assertIn(root.v.handle, signature_captures) + self.assertIn(root.v1.handle, signature_captures) + def test_concrete_function_arg_names(self, cycles): @def_function.function( @@ -1489,6 +1569,7 @@ class LoadTest(test.TestCase, parameterized.TestCase): 3 * (1 + 4 + 9 + 16), root(constant_op.constant(3, dtype=dtypes.int64)).numpy()) + @test_util.run_in_graph_and_eager_modes def test_dense_features_layer(self, cycles): columns = [feature_column_v2.numeric_column("x"), feature_column_v2.numeric_column("y")] @@ -1496,7 +1577,7 @@ class LoadTest(test.TestCase, parameterized.TestCase): model = sequential.Sequential([layer]) model_input = {"x": constant_op.constant([[1.]]), "y": constant_op.constant([[2.]])} - self.assertAllClose([[1., 2.]], model.predict(model_input)) + self.assertAllClose([[1., 2.]], model.predict(model_input, steps=1)) loaded = self.cycle(model, cycles) output, = loaded._default_save_signature(model_input).values() self.assertAllClose([[1., 2.]], output) @@ -1532,6 +1613,16 @@ class LoadTest(test.TestCase, parameterized.TestCase): dict(out=2., out_1=3.), loaded.signatures["serving_default"](constant_op.constant(1.))) + def test_tuple_signature(self, cycles): + root = util.Checkpoint() + root.f = def_function.function( + lambda: (array_ops.ones([]), array_ops.zeros([])), + input_signature=()) + for _ in range(cycles): + root = self.cycle(root, 1, signatures=root.f) + self.assertEqual(({"output_0": 1., "output_1": 0.}), + self.evaluate(root.signatures["serving_default"]())) + def test_model_with_custom_function_attached(self, cycles): root = util.Checkpoint(model=sequential.Sequential([core.Dense(2)])) @@ -1547,6 +1638,12 @@ class LoadTest(test.TestCase, parameterized.TestCase): original, root.model.traced_call(array_ops.zeros([1, 1])).numpy()) + def test_version_info(self, cycles): + root = util.Checkpoint() + root = self.cycle(root, cycles) + self.assertEqual(versions.__version__, root.tensorflow_version) + self.assertEqual(versions.__git_version__, root.tensorflow_git_version) + def test_functional_model_with_conv(self, cycles): x = input_layer.Input(name="x", shape=(None, None, 3), dtype=dtypes.float32) conved = convolutional.Conv2D(filters=3, kernel_size=3, dilation_rate=2)(x) diff --git a/tensorflow/python/saved_model/load_v1_in_v2.py b/tensorflow/python/saved_model/load_v1_in_v2.py index 4d0ef7ba89f..d375584205e 100644 --- a/tensorflow/python/saved_model/load_v1_in_v2.py +++ b/tensorflow/python/saved_model/load_v1_in_v2.py 
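The `tensorflow_version` / `tensorflow_git_version` attributes checked by `test_version_info` are plain strings copied from the MetaGraph's `meta_info_def`; after loading they can be inspected directly (the path and printed values below are illustrative):

```python
import tensorflow as tf

loaded = tf.saved_model.load("/tmp/exported_model")  # illustrative path
# Both fields are populated from meta_info_def at load time.
print(loaded.tensorflow_version)      # e.g. "1.14.0"
print(loaded.tensorflow_git_version)  # e.g. "v1.14.0-rc0-..."
```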
@@ -20,10 +20,12 @@ from __future__ import print_function import functools +from tensorflow.python.eager import context from tensorflow.python.eager import lift_to_graph from tensorflow.python.eager import wrap_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.saved_model import loader_impl from tensorflow.python.saved_model import signature_serialization @@ -56,7 +58,7 @@ class _Initializer(tracking.CapturableResource): dtype=dtypes.resource, shape=[], name="unused_resource") def _initialize(self): - self._init_fn(*[path.asset_path for path in self._asset_paths]) + return self._init_fn(*[path.asset_path for path in self._asset_paths]) class _EagerSavedModelLoader(loader_impl.SavedModelLoader): @@ -90,11 +92,21 @@ class _EagerSavedModelLoader(loader_impl.SavedModelLoader): """Restores variables from the checkpoint.""" if saver is not None: saver_def = saver.saver_def + filename_tensor = wrapped.graph.as_graph_element( + saver_def.filename_tensor_name) + # We both feed and fetch filename_tensor so we have an operation to use to + # feed into variable initializers (only relevant for v1 graph building). restore_fn = wrapped.prune( - feeds=[wrapped.graph.as_graph_element( - saver_def.filename_tensor_name)], - fetches=[wrapped.graph.as_graph_element(saver_def.restore_op_name)]) - restore_fn(constant_op.constant(self._variables_path)) + feeds=[filename_tensor], + fetches=[filename_tensor, + wrapped.graph.as_graph_element(saver_def.restore_op_name)]) + initializer, _ = restore_fn(constant_op.constant(self._variables_path)) + if not ops.executing_eagerly_outside_functions(): + for variable in wrapped.graph.get_collection_ref( + ops.GraphKeys.GLOBAL_VARIABLES): + # pylint: disable=protected-access + variable._initializer_op = initializer + # pylint: enable=protected-access def _extract_signatures(self, wrapped, meta_graph_def): """Creates ConcreteFunctions for signatures in `meta_graph_def`.""" @@ -106,10 +118,8 @@ class _EagerSavedModelLoader(loader_impl.SavedModelLoader): input_names = [] input_specs = [] # TODO(allenl): Support optional arguments - feeds = [wrapped.graph.as_graph_element(inp.name) - for inp in input_specs] - fetches = {name: wrapped.graph.as_graph_element(out.name) - for name, out in signature_def.outputs.items()} + feeds = [wrapped.graph.as_graph_element(inp.name) for inp in input_specs] + fetches = {name: out for name, out in signature_def.outputs.items()} try: signature_fn = wrapped.prune(feeds=feeds, fetches=fetches) except lift_to_graph.UnliftableError as ex: @@ -151,6 +161,8 @@ class _EagerSavedModelLoader(loader_impl.SavedModelLoader): with wrapped.graph.as_default(): init_op = loader_impl.get_init_op( meta_graph_def) or monitored_session.Scaffold.default_local_init_op() + # Add a dummy Tensor we know we can fetch to add control dependencies to. 
+ init_anchor = constant_op.constant(0., name="dummy_fetch") root = tracking.AutoTrackable() asset_feed_tensors = [] @@ -161,9 +173,19 @@ class _EagerSavedModelLoader(loader_impl.SavedModelLoader): asset_paths.append(tracking.TrackableAsset(value)) init_fn = wrapped.prune( feeds=asset_feed_tensors, - fetches=[wrapped.graph.as_graph_element(init_op)]) + fetches=[init_anchor, wrapped.graph.as_graph_element(init_op)]) initializer = _Initializer(init_fn, asset_paths) - initializer._initialize() # pylint: disable=protected-access + # pylint: disable=protected-access + local_init_op, _ = initializer._initialize() + # pylint: enable=protected-access + with ops.init_scope(): + if not context.executing_eagerly(): + ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, local_init_op) + for variable in wrapped.graph.get_collection_ref( + ops.GraphKeys.LOCAL_VARIABLES): + # pylint: disable=protected-access + variable._initializer_op = local_init_op + # pylint: enable=protected-access root.initializer = initializer root.asset_paths = asset_paths signature_functions = self._extract_signatures(wrapped, meta_graph_def) @@ -171,6 +193,12 @@ class _EagerSavedModelLoader(loader_impl.SavedModelLoader): root.signatures = signature_serialization.create_signature_map( signature_functions) root.variables = list(wrapped.graph.variables) + root.tensorflow_version = ( + meta_graph_def.meta_info_def.tensorflow_version) + root.tensorflow_git_version = ( + meta_graph_def.meta_info_def.tensorflow_git_version) + root.graph = wrapped.graph + root.prune = wrapped.prune return root diff --git a/tensorflow/python/saved_model/load_v1_in_v2_test.py b/tensorflow/python/saved_model/load_v1_in_v2_test.py index b6a1c9d0c47..8c64413a42c 100644 --- a/tensorflow/python/saved_model/load_v1_in_v2_test.py +++ b/tensorflow/python/saved_model/load_v1_in_v2_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import os import shutil +from tensorflow.core.framework import variable_pb2 from tensorflow.python.client import session as session_lib from tensorflow.python.eager import backprop from tensorflow.python.eager import lift_to_graph @@ -28,6 +29,9 @@ from tensorflow.python.eager import test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import test_util +from tensorflow.python.framework import versions from tensorflow.python.lib.io import file_io from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -50,7 +54,7 @@ class LoadTest(test.TestCase): export_graph = ops.Graph() with export_graph.as_default(): start = array_ops.placeholder( - shape=[None], dtype=dtypes.float32, name="start") + shape=None, dtype=dtypes.float32, name="start") if use_resource: distractor = variables.RefVariable(-1., name="distractor") v = resource_variable_ops.ResourceVariable(3., name="v") @@ -80,17 +84,20 @@ class LoadTest(test.TestCase): legacy_init_op=local_variable.initializer) return path + @test_util.run_in_graph_and_eager_modes def test_resource_variable_import(self): imported = load.load(self._v1_single_metagraph_saved_model( use_resource=True)) + self.evaluate(variables.global_variables_initializer()) + self.evaluate(variables.local_variables_initializer()) fn = imported.signatures["serving_default"] self.assertEqual({"output": 6.}, self.evaluate(fn(constant_op.constant(2.)))) self.assertAllEqual([3., 1.], 
self.evaluate(imported.variables)) - imported.variables[0].assign(4.) + self.evaluate(imported.variables[0].assign(4.)) self.assertEqual({"output": 8.}, self.evaluate(fn(start=constant_op.constant(2.)))) - imported.variables[1].assign(2.) + self.evaluate(imported.variables[1].assign(2.)) self.assertEqual({"output": 24.}, self.evaluate(fn(start=constant_op.constant(3.)))) self.assertTrue(imported.variables[0].trainable) @@ -98,7 +105,9 @@ class LoadTest(test.TestCase): with backprop.GradientTape() as tape: output = fn(start=constant_op.constant(4.)) self.assertEqual(imported.variables[:1], list(tape.watched_variables())) - self.assertEqual(8., tape.gradient(output, imported.variables[0]).numpy()) + self.assertEqual( + 8., + self.evaluate(tape.gradient(output, imported.variables[0]))) def test_ref_variable_import(self): saved = self._v1_single_metagraph_saved_model(use_resource=False) @@ -106,6 +115,34 @@ class LoadTest(test.TestCase): fn = imported.signatures["serving_default"] self.assertEqual(6., fn(start=constant_op.constant(2.))["output"].numpy()) + def _v1_output_shape_saved_model(self): + export_graph = ops.Graph() + with export_graph.as_default(): + start = array_ops.placeholder( + shape=[None], dtype=dtypes.float32, name="start") + output = array_ops.identity(start, name="output") + output.set_shape([1]) # Ok to use [1] because shape is only informational + with session_lib.Session() as session: + path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid())) + builder = builder_impl.SavedModelBuilder(path) + builder.add_meta_graph_and_variables( + session, + tags=[tag_constants.SERVING], + signature_def_map={ + "serving_default": + signature_def_utils.build_signature_def( + {"start": utils_impl.build_tensor_info(start)}, + {"output": utils_impl.build_tensor_info(output)}) + }) + builder.save() + return path + + def test_restore_output_shapes(self): + saved = self._v1_output_shape_saved_model() + imported = load.load(saved) + fn = imported.signatures["serving_default"] + self.assertEqual(tensor_shape.TensorShape([1]), fn.outputs[0].shape) + def _v1_multi_metagraph_saved_model(self): export_graph = ops.Graph() with export_graph.as_default(): @@ -184,9 +221,11 @@ class LoadTest(test.TestCase): file_io.delete_file(vocab_path) return path + @test_util.run_in_graph_and_eager_modes def test_asset_loading(self): first_path = self._v1_asset_saved_model() imported = load.load(first_path) + self.evaluate(lookup_ops.tables_initializer()) fn = imported.signatures["serving_default"] self.assertAllClose({"output": [2, 0]}, fn(start=constant_op.constant(["gamma", "alpha"]))) @@ -194,7 +233,9 @@ class LoadTest(test.TestCase): str(ops.uid())) save.save(imported, second_path, signatures=imported.signatures) shutil.rmtree(first_path) + del ops.get_collection_ref(ops.GraphKeys.TABLE_INITIALIZERS)[:] second_import = load.load(second_path) + self.evaluate(lookup_ops.tables_initializer()) fn = second_import.signatures["serving_default"] self.assertAllClose({"output": [2, 0]}, fn(start=constant_op.constant(["gamma", "alpha"]))) @@ -203,7 +244,9 @@ class LoadTest(test.TestCase): str(ops.uid())) save.save(second_import, third_path, signatures=second_import.signatures) shutil.rmtree(second_path) + del ops.get_collection_ref(ops.GraphKeys.TABLE_INITIALIZERS)[:] third_import = load.load(third_path) + self.evaluate(lookup_ops.tables_initializer()) fn = third_import.signatures["serving_default"] self.assertAllClose({"output": [2, 0]}, fn(start=constant_op.constant(["gamma", "alpha"]))) @@ -294,8 +337,8 
@@ class LoadTest(test.TestCase): def _no_signatures_model(self): export_graph = ops.Graph() with export_graph.as_default(): - array_ops.placeholder(name="x", shape=[], dtype=dtypes.float32) - + inp = array_ops.placeholder(name="x", shape=[], dtype=dtypes.float32) + array_ops.identity(inp + 1., name="out") with session_lib.Session() as session: path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid())) b = builder_impl.SavedModelBuilder(path) @@ -334,6 +377,13 @@ class LoadTest(test.TestCase): imported = load.load(path) self.assertEqual([2], imported.signatures["key"]()["value"].shape) + def test_version_info(self): + path = self._signature_with_no_inputs() + imported = load.load(path) + self.assertEqual(versions.__version__, imported.tensorflow_version) + self.assertEqual(versions.__git_version__, + imported.tensorflow_git_version) + def _unfed_placeholder_signature(self): export_graph = ops.Graph() with export_graph.as_default(): @@ -358,5 +408,86 @@ class LoadTest(test.TestCase): "signature needs an input for each placeholder.*\n\nUnable to lift"): load.load(path) + def test_custom_pruning(self): + path = self._no_signatures_model() + root = load.load(path) + fn = root.prune("x:0", "out:0") + self.assertEqual(2., self.evaluate(fn(x=array_ops.ones([])))) + root.graph.as_graph_element("x:0") + + def _no_trainable_variable_attribute(self, trainable): + """A SavedModel where the VariableDef has no 'trainable' (it's false).""" + + class _MissingFieldsVariable(resource_variable_ops.ResourceVariable): + + def to_proto(self, export_scope=None): + full_proto = super(_MissingFieldsVariable, self).to_proto(export_scope) + return variable_pb2.VariableDef( + variable_name=full_proto.variable_name, + initial_value_name=full_proto.initial_value_name, + initializer_name=full_proto.snapshot_name, + save_slice_info_def=full_proto.save_slice_info_def, + is_resource=full_proto.is_resource) + + export_graph = ops.Graph() + with export_graph.as_default(): + v = _MissingFieldsVariable(3., trainable=trainable) + with session_lib.Session() as session: + session.run([v.initializer]) + path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid())) + b = builder_impl.SavedModelBuilder(path) + b.add_meta_graph_and_variables( + session, + tags=[tag_constants.SERVING], + signature_def_map={}) + b.save() + + return path + + def test_trainable_not_set_in_proto(self): + """If a VariableDef has no 'trainable', we fall back to collections.""" + real_tf_version = versions.__version__ + # Pretend to be exported from an older version of TensorFlow, so trainable + # will follow collections instead of checking VariableDefs. 
+ versions.__version__ = "1.7.0" + path = self._no_trainable_variable_attribute(trainable=True) + root = load.load(path) + self.assertTrue(root.variables[0].trainable) + path = self._no_trainable_variable_attribute(trainable=False) + root = load.load(path) + self.assertFalse(root.variables[0].trainable) + versions.__version__ = real_tf_version + + def _export_variable(self, **kwargs_for_variable): + """A 1.x SavedModel with a single variable.""" + export_graph = ops.Graph() + with export_graph.as_default(): + v = resource_variable_ops.ResourceVariable(3., **kwargs_for_variable) + with session_lib.Session() as session: + session.run([v.initializer]) + path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid())) + b = builder_impl.SavedModelBuilder(path) + b.add_meta_graph_and_variables( + session, + tags=[tag_constants.SERVING], + signature_def_map={}) + b.save() + + return path + + def test_trainable_in_proto(self): + """If a VariableDef has a trainable property, we do not use collections.""" + path = self._export_variable( + trainable=True, + collections=[ops.GraphKeys.GLOBAL_VARIABLES]) + root = load.load(path) + self.assertTrue(root.variables[0].trainable) + path = self._export_variable( + trainable=False, + collections=[ops.GraphKeys.GLOBAL_VARIABLES, + ops.GraphKeys.TRAINABLE_VARIABLES]) + root = load.load(path) + self.assertFalse(root.variables[0].trainable) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/saved_model/model_utils/BUILD b/tensorflow/python/saved_model/model_utils/BUILD index 493574a225d..7c2a5d04b24 100644 --- a/tensorflow/python/saved_model/model_utils/BUILD +++ b/tensorflow/python/saved_model/model_utils/BUILD @@ -49,6 +49,7 @@ py_library( py_test( name = "export_output_test", srcs = ["export_output_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":export_output", @@ -83,6 +84,7 @@ py_library( py_test( name = "export_test", srcs = ["export_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":export_utils", @@ -109,6 +111,7 @@ py_library( py_test( name = "mode_keys_test", srcs = ["mode_keys_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":mode_keys", diff --git a/tensorflow/python/saved_model/signature_serialization.py b/tensorflow/python/saved_model/signature_serialization.py index 0cd64ee5cad..dfb2d452cf3 100644 --- a/tensorflow/python/saved_model/signature_serialization.py +++ b/tensorflow/python/saved_model/signature_serialization.py @@ -136,7 +136,7 @@ def canonicalize_signatures(signatures): def _is_flat(sequence): sequence_flat = nest.flatten(sequence) try: - nest.assert_same_structure(sequence_flat, sequence) + nest.assert_same_structure(sequence_flat, sequence, check_types=False) return True except ValueError: return False diff --git a/tensorflow/python/tools/api/generator/BUILD b/tensorflow/python/tools/api/generator/BUILD index 109c71b41d0..59f20d52a0c 100644 --- a/tensorflow/python/tools/api/generator/BUILD +++ b/tensorflow/python/tools/api/generator/BUILD @@ -43,6 +43,7 @@ py_test( "create_python_api.py", "create_python_api_test.py", ], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":doc_srcs", @@ -59,6 +60,7 @@ py_test( "--api_name=tensorflow", ] + KERAS_API_INIT_FILES + KERAS_API_INIT_FILES_V1 + TENSORFLOW_API_INIT_FILES + TENSORFLOW_API_INIT_FILES_V1, main = "doc_srcs_test.py", + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":doc_srcs", @@ -74,6 +76,7 @@ py_test( "api_init_files.bzl", "api_init_files_v1.bzl", ], + 
python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD index 81158f1fac3..171d44c1fee 100644 --- a/tensorflow/python/tpu/BUILD +++ b/tensorflow/python/tpu/BUILD @@ -275,30 +275,6 @@ tf_py_test( ], ) -tf_py_test( - name = "tpu_config_test", - size = "small", - srcs = ["tpu_config_test.py"], - additional_deps = [ - ":tpu_estimator", - "//tensorflow/python:framework", - "//tensorflow/python:framework_test_lib", - ], -) - -tf_py_test( - name = "tpu_estimator_signals_test", - size = "small", - srcs = ["tpu_estimator_signals_test.py"], - additional_deps = [ - ":tpu_estimator", - "//tensorflow/python:framework", - "//tensorflow/python:framework_test_lib", - ], - # TODO(jhseu): Remove. Fails in OSS on Python 3. - tags = ["no_oss"], -) - tf_py_test( name = "topology_test", size = "medium", diff --git a/tensorflow/python/tpu/_tpu_estimator_embedding.py b/tensorflow/python/tpu/_tpu_estimator_embedding.py index 4a832dbbe3a..d85aae64871 100644 --- a/tensorflow/python/tpu/_tpu_estimator_embedding.py +++ b/tensorflow/python/tpu/_tpu_estimator_embedding.py @@ -1,366 +1,23 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# =================================================================== -"""Tooling for support TPU embedding in TPUEstimator.""" +# ============================================================================== +"""Stub file to maintain backwards compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections - -from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.feature_column import feature_column as core_fc -from tensorflow.python.feature_column import feature_column_lib as core_fc_lib -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import sparse_tensor -from tensorflow.python.ops import math_ops -from tensorflow.python.tpu import feature_column as tpu_fc -from tensorflow.python.tpu import tpu_embedding -from tensorflow.python.tpu.tpu_embedding import AdagradParameters -from tensorflow.python.tpu.tpu_embedding import AdamParameters -from tensorflow.python.tpu.tpu_embedding import StochasticGradientDescentParameters -from tensorflow.python.training import training - -# pylint: disable=protected-access -_TPU_EMBEDDING_COLUMN_CLASSES = (tpu_fc._TPUEmbeddingColumn, - tpu_fc._TPUSharedEmbeddingColumn) -_EMBEDDING_COLUMN_CLASSES = (core_fc._EmbeddingColumn, - core_fc_lib.EmbeddingColumn, - core_fc._SharedEmbeddingColumn) -_SUPPORTED_FEATURE_COLUMNS = (core_fc._NumericColumn, core_fc_lib.NumericColumn) -_SUPPORTED_OPTIMIZERS = (AdagradParameters, AdamParameters, - StochasticGradientDescentParameters) - -# pylint: enable=protected-access - -_TABLE_NAME_PREFIX = 'tbl_' -_LEN_TABLE_NAME_PREFIX = len(_TABLE_NAME_PREFIX) - - -def _get_table_name_from_embedding_var_name(embedding_var_name): - return '{}{}'.format(_TABLE_NAME_PREFIX, embedding_var_name) - - -def _get_embedding_var_name_from_table_name(table_name): - return table_name[_LEN_TABLE_NAME_PREFIX:] - - -def _get_embedding_variable_name(scope_name, var_name): - return '{}/{}'.format(scope_name, var_name) - - -def _get_slot_variable_names(scope_name, var_name, optimization_parameters): - """Return embedding variable names which are consistent with CPU runs.""" - if isinstance(optimization_parameters, tpu_embedding.AdagradParameters): - return tpu_embedding.AdagradSlotVariableName( - '{}/{}/Adagrad'.format(scope_name, var_name) - ) - elif isinstance(optimization_parameters, tpu_embedding.AdamParameters): - return tpu_embedding.AdamSlotVariableNames( - '{}/{}/Adam/m'.format(scope_name, var_name), - '{}/{}/Adam/v'.format(scope_name, var_name) - ) - elif isinstance(optimization_parameters, - tpu_embedding.StochasticGradientDescentParameters): - return None - else: - raise ValueError('Support to infer full variable name ' - 'for optimization_parameter {} has not been added.' - .format(optimization_parameters)) - - -def get_full_variable_names( - graph, table_to_config_dict, optimization_parameters=None): - """Return embedding variable names and slot variables which are consistent with CPU runs.""" - collection = graph.get_collection_ref(tpu_fc._TPU_FC_TO_SCOPE) # pylint: disable=protected-access - if not collection: - raise RuntimeError( - 'Embedding feature column did not capture any thing. 
Make sure the ' - 'feature columns passed to TPUEstimator constructor is properly ' - 'used in model_fn.') - - embedding_variable_name_by_table = {} - slot_variable_names_by_table = {} - for table_name in table_to_config_dict: - embedding_var_name = _get_embedding_var_name_from_table_name(table_name) - (scope_name, var_name) = collection[0][embedding_var_name] - embedding_variable_name_by_table[table_name] = ( - _get_embedding_variable_name(scope_name, var_name)) - if optimization_parameters: - slot_variable_names_by_table[table_name] = _get_slot_variable_names( - scope_name, var_name, optimization_parameters) - - graph.clear_collection(tpu_fc._TPU_FC_TO_SCOPE) # pylint: disable=protected-access - return embedding_variable_name_by_table, slot_variable_names_by_table - - -def get_configs_from_feature_columns(feature_columns): - """Create configs for TPUEmbedding etc from a list of feature columns. - - Args: - feature_columns: a list of supported feature columns. - - Returns: - A tuple of dicts, the first maps tables to their config, the second maps - features to their config, and the third maps features to weight key names. - """ - - allowed = (tpu_fc._TPUEmbeddingColumn, tpu_fc._TPUSharedEmbeddingColumn) # pylint: disable=protected-access - - for column in feature_columns: - if not isinstance(column, allowed): - raise TypeError( - 'Unsupported feature column {}. Supported types are {}.'.format( - type(column), allowed)) - - table_to_config = {} - feature_to_config = {} - feature_to_weight_key_name = {} - for column in feature_columns: - feature_name = column.get_feature_key_name() - table_name = _get_table_name_from_embedding_var_name( - column.get_embedding_var_name()) - if feature_name in feature_to_config: - raise ValueError( - 'Feature column {} is used with multiple embeddings and this is ' - 'not supported.'.format(feature_name)) - feature_to_config[feature_name] = tpu_embedding.FeatureConfig( - table_id=table_name) - feature_to_weight_key_name[feature_name] = column.get_weight_key_name() - vocabulary_size, dimension = column.get_embedding_table_size() - table_to_config[table_name] = tpu_embedding.TableConfig( - vocabulary_size=vocabulary_size, - dimension=dimension, - initializer=column.get_initializer(), - combiner=column.get_combiner()) - - return table_to_config, feature_to_config, feature_to_weight_key_name - - -class EmbeddingConfigSpec( - collections.namedtuple('EmbeddingConfigSpec', [ - 'feature_columns', 'optimization_parameters', 'clipping_limit', - 'pipeline_execution_with_tensor_core', - 'experimental_gradient_multiplier_fn' - ])): - """Class to keep track of embedding config specification.""" - - def __new__(cls, - feature_columns, - optimization_parameters, - clipping_limit=None, - pipeline_execution_with_tensor_core=False, - experimental_gradient_multiplier_fn=None): - """Creates an EmbeddingConfigSpec instance. - - Args: - feature_columns: All `FeatureColumn`s used by model. - optimization_parameters: An instance of `AdagradParameters`, - `AdamParameters` or `StochasticGradientDescentParameters`. This - optimizer will be applied to all embedding variables specified by - `feature_columns`. - clipping_limit: (Optional) Clipping limit (absolute value). - pipeline_execution_with_tensor_core: setting this to `True` makes training - faster, but trained model will be different if step N and step N+1 - involve the same set of embedding IDs. Please see - `tpu_embedding_configuration.proto` for details. 
- experimental_gradient_multiplier_fn: (Optional) A Fn taking global step as - input returning the current multiplier for all embedding gradients. - - Returns: - An EmbeddingConfigSpec instance. - - Raises: - ValueError: If the feature_columns are not specified. - TypeError: If the feature columns are not of ths correct type (one of - _SUPPORTED_FEATURE_COLUMNS, _TPU_EMBEDDING_COLUMN_CLASSES OR - _EMBEDDING_COLUMN_CLASSES). - ValueError: If `optimization_parameters` is not one of the required types. - """ - if not feature_columns: - raise ValueError('`feature_columns` cannot be `None` or empty.') - - # It is unknown at this moment, whether the TPUEstimator is running in CPU - # or TPU mode. So allow non-TPU embedding columns also. - supported_classes = tuple( - list(_SUPPORTED_FEATURE_COLUMNS) + list(_TPU_EMBEDDING_COLUMN_CLASSES) + - list(_EMBEDDING_COLUMN_CLASSES)) - - for column in feature_columns: - if not isinstance(column, supported_classes): - raise TypeError( - 'All feature columns must be supported types in {}. Got {}'.format( - supported_classes, type(column))) - - if not isinstance(optimization_parameters, _SUPPORTED_OPTIMIZERS): - raise ValueError('optimization_parameters must be an instance of type ' - '{}. Got {}.'.format(_SUPPORTED_OPTIMIZERS, - type(optimization_parameters))) - - return super(EmbeddingConfigSpec, cls).__new__( - cls, - feature_columns=feature_columns, - optimization_parameters=optimization_parameters, - clipping_limit=clipping_limit, - pipeline_execution_with_tensor_core=pipeline_execution_with_tensor_core, - experimental_gradient_multiplier_fn=experimental_gradient_multiplier_fn) - - -class EmbeddingConfig(object): - """This is the internal immutable object for embedding config. - - `_EmbeddingConfig` is responsible to _translate_ user provided - `EmbeddingConfigSpec` to internal data structures, mostly constructor - arguments of `TPUEmbedding`. 
- """ - - def __init__(self, embedding_config_spec, train_batch_size, eval_batch_size, - num_hosts, num_cores, run_config): - if not embedding_config_spec: - raise ValueError('embedding_config_spec cannot be None.') - - self._embedding_config_spec = embedding_config_spec - self._train_batch_size = train_batch_size - self._eval_batch_size = eval_batch_size - self._num_hosts = num_hosts - self._num_cores = num_cores - self._run_config = run_config - - (self._table_to_config_dict, self._feature_to_config_dict, - self.feature_to_weight_key_name_dict) = ( - get_configs_from_feature_columns( - embedding_config_spec.feature_columns)) - self._mode_to_tpu_embedding_dict = {} - self.dummy_table_variables = None - - self._grad_multiplier_fn = ( - embedding_config_spec.experimental_gradient_multiplier_fn) - - def get_grad_multiplier(self): - if self._grad_multiplier_fn: - return ops.convert_to_tensor( - self._grad_multiplier_fn(training.get_global_step()), - dtype=dtypes.float32) - - def has_embedding_tables(self): - return bool(self._table_to_config_dict) - - def _create_tpu_embedding(self, mode): - """Create tpu_embedding.TPUEmbedding based on mode.""" - if mode == model_fn_lib.ModeKeys.TRAIN: - batch_size = self._train_batch_size - else: - batch_size = self._eval_batch_size - - if mode == model_fn_lib.ModeKeys.TRAIN: - tpu_embedding_mode = tpu_embedding.TRAINING - optimization_parameters = ( - self._embedding_config_spec.optimization_parameters) - elif (mode == model_fn_lib.ModeKeys.EVAL or - mode == model_fn_lib.ModeKeys.PREDICT): - tpu_embedding_mode = tpu_embedding.INFERENCE - optimization_parameters = None - else: - raise ValueError('Mode {} is not supported.'.format(mode)) - - if self._run_config.cluster: - master = self._run_config.cluster.master() - cluster_spec = self._run_config.cluster.cluster_spec() - cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None - else: - master = ( - self._run_config.evaluation_master - if mode == model_fn_lib.ModeKeys.EVAL else self._run_config.master) - cluster_def = None - tpu_embedding_ = tpu_embedding.TPUEmbedding( - self._table_to_config_dict, - self._feature_to_config_dict, - batch_size, - tpu_embedding_mode, - master, - optimization_parameters, - cluster_def, - pipeline_execution_with_tensor_core=self._embedding_config_spec - .pipeline_execution_with_tensor_core) - return tpu_embedding_ - - def get_tpu_embedding(self, mode): - if mode not in self._mode_to_tpu_embedding_dict: - self._mode_to_tpu_embedding_dict[mode] = ( - self._create_tpu_embedding(mode)) - return self._mode_to_tpu_embedding_dict[mode] - - -def split_inputs(ctx, features, labels): - """Splits the dense and sparse tensors inside the features and labels.""" - enqueue_datas = collections.OrderedDict() - if ctx.embedding_config: - tpu_embedding_ = ctx.embedding_config.tpu_embedding - feature_to_weight_key_name_dict = ( - ctx.embedding_config.feature_to_weight_key_name_dict) - for feature_key in tpu_embedding_.feature_to_config_dict: - sparse_feature = _get_sparse_feature_from_feature(feature_key, features) - weight_key_name = feature_to_weight_key_name_dict[feature_key] - if isinstance(sparse_feature, sparse_tensor.SparseTensor): - weights = _get_weights_from_features(weight_key_name, features) - enqueue_data = tpu_embedding.EnqueueData.from_sparse_tensor( - sparse_feature, weights) - else: - if weight_key_name is not None: - raise ValueError( - 'Found weights {} for weighted_categorical_column, which is not' - 'compatible with sparse feature {} enqueued as dense tensor.' 
- .format(weight_key_name, feature_key)) - enqueue_data = tpu_embedding.EnqueueData(sparse_feature) - enqueue_datas[feature_key] = enqueue_data - - return features, labels, enqueue_datas - - -def _get_sparse_feature_from_feature(feature_key, features): - """Pop and return sparse feature.""" - sparse_feature = features.pop(feature_key) - if not sparse_feature.dtype.is_integer: - raise ValueError('SparseTensor with string as values are not supported. ' - 'If you are using vocabulary_file_categorical_column or ' - 'vocabulary_list_categorical_column, please call ' - 'your_column.categorical_column._transform_feature({{' - 'your_column.key: features[your_column.key]}}) in' - 'your input_fn() to convert string to int. ' - 'feature_key = {}.'.format(feature_key)) - return sparse_feature - - -def _get_weights_from_features(weight_key_name, features): - """Pop and return feature for weights, possibly None.""" - weights = None - if weight_key_name is not None: - if weight_key_name in features: - weights = features.pop(weight_key_name) - else: - raise ValueError( - 'Cannot find weights {} for weighted_categorical_column.' - ' Please check if the weights are present in feature dict. Also' - ' note weight-sharing among weighted_categorical_column is not ' - 'supported on TPU.'.format(weight_key_name)) - if not isinstance(weights, sparse_tensor.SparseTensor): - raise ValueError( - 'weighted_categorical_column with weight key name {} has dense ' - 'weights. Dense weights are not supported on TPU. Please use ' - 'sparse weights instead.'.format(weight_key_name)) - if weights.dtype is not dtypes.float32: - weights = math_ops.to_float(weights) - return weights +# pylint: disable=wildcard-import,unused-import +from tensorflow_estimator.python.estimator.tpu._tpu_estimator_embedding import * +# pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/python/tpu/error_handling.py b/tensorflow/python/tpu/error_handling.py index 87f0b303d46..9cbb5084a54 100644 --- a/tensorflow/python/tpu/error_handling.py +++ b/tensorflow/python/tpu/error_handling.py @@ -1,135 +1,23 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# =================================================================== -"""ErrorRendezvous handler for collecting errors from multiple threads.""" +# ============================================================================== +"""Stub file to maintain backwards compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import contextlib -import sys -import threading -import time - -import six - -from tensorflow.python.framework import errors -from tensorflow.python.platform import tf_logging as logging - -_UNINTERESTING_ERRORS = (errors.CancelledError,) - - -class ErrorRendezvous(object): - """Resolve errors from multiple threads during TPU execution. 
- - TPU errors can occur on the infeed or outfeed threads as well as the main - training thread. - - Depending on which thread "wins" and receives the session error first, we may - end up showing users a confusing and non-actionable error message (session - cancelled) instead of a root cause (e.g. a bad filename). - - The rendezvous object provides a location to capture these errors until all - threads terminate. At that point we can choose the most informative error - to report. - """ - - def __init__(self, num_sources): - # string -> (message, traceback) - self._errors = {} - self._num_sources = num_sources - self._session_cancel_timer = None - - def record_error(self, source, exc_info, session=None): - """Report an exception from the given source. - - If a session is passed, a timer will be registered to close it after a few - seconds. This is necessary to ensure the main training loop does not hang - if an infeed/oufeed error occurs. We sleep a few seconds to allow a more - interesting error from another thread to propagate. - - Args: - source: string, source of the error - exc_info: Output from `sys.exc_info` (type, value, traceback) - session: Session to close after delay. - """ - _, value, _ = exc_info - self._errors[source] = exc_info - logging.error('Error recorded from %s: %s', source, value) - - if session is not None and self._session_cancel_timer is None: - - def _cancel_session(): - time.sleep(5) - logging.error('Closing session due to error %s' % value) - try: - session.close() - except: # pylint: disable=bare-except - logging.error( - '\n\n\nFailed to close session after error.' - 'Other threads may hang.\n\n\n') - - self._session_cancel_timer = threading.Thread(target=_cancel_session,) - self._session_cancel_timer.daemon = True - self._session_cancel_timer.start() - - def record_done(self, source): - """Mark execution source `source` as done. - - If an error was originally reported from `source` it is left intact. - - Args: - source: `str`, source being recorded - """ - logging.info('%s marked as finished', source) - if source not in self._errors: - self._errors[source] = None - - @contextlib.contextmanager - def catch_errors(self, source, session=None): - """Context manager to report any errors within a block.""" - try: - yield - except Exception: # pylint: disable=broad-except - self.record_error(source, sys.exc_info(), session) - - def raise_errors(self, timeout_sec=0): - """Wait for up to `timeout` seconds for all error sources to finish. - - Preferentially raise "interesting" errors (errors not in the - _UNINTERESTING_ERRORS) set. - - Args: - timeout_sec: Seconds to wait for other error sources. - """ - for _ in range(timeout_sec): - if len(self._errors) == self._num_sources: - break - time.sleep(1) - - kept_errors = [(k, v) for (k, v) in self._errors.items() if v is not None] - - # First check for any interesting errors, then fall back on the session - # cancelled errors etc. 
- for k, (typ, value, traceback) in kept_errors: - if isinstance(value, _UNINTERESTING_ERRORS): - continue - else: - logging.warn('Reraising captured error') - six.reraise(typ, value, traceback) - - for k, (typ, value, traceback) in kept_errors: - logging.warn('Reraising captured error') - six.reraise(typ, value, traceback) +# pylint: disable=wildcard-import,unused-import +from tensorflow_estimator.python.estimator.tpu.error_handling import * +# pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/python/tpu/tpu_config.py b/tensorflow/python/tpu/tpu_config.py index cc5a8b0efe2..2c9bce0bca2 100644 --- a/tensorflow/python/tpu/tpu_config.py +++ b/tensorflow/python/tpu/tpu_config.py @@ -1,295 +1,23 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# =================================================================== - -"""A RunConfig subclass with TPU support.""" +# ============================================================================== +"""Stub file to maintain backwards compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections -import json -import os - -from tensorflow.core.protobuf import config_pb2 -from tensorflow.python.estimator import run_config as run_config_lib -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.tpu import util as util_lib - -# pylint: disable=protected-access -_TF_CONFIG_ENV = run_config_lib._TF_CONFIG_ENV -_SERVICE_KEY = run_config_lib._SERVICE_KEY -_TPU_WORKER_JOB_NAME = 'tpu_worker_job_name' -# pylint: enable=protected-access - - -class InputPipelineConfig(object): - r"""Please see the definition of these values in TPUConfig.""" - PER_SHARD_V1 = 1 - PER_HOST_V1 = 2 - PER_HOST_V2 = 3 - BROADCAST = 4 - SLICED = 5 - - -class TPUConfig( - collections.namedtuple('TPUConfig', [ - 'iterations_per_loop', - 'num_shards', - 'num_cores_per_replica', - 'per_host_input_for_training', - 'tpu_job_name', - 'initial_infeed_sleep_secs', - 'input_partition_dims', - 'eval_training_input_configuration', - ])): - r"""TPU related configuration required by `TPUEstimator`. - - Args: - iterations_per_loop: This is the number of train steps running in TPU - system before returning to CPU host for each `Session.run`. This means - global step is increased `iterations_per_loop` times in one `Session.run`. - It is recommended to be set as number of global steps for next checkpoint. - Note that in evaluation don't use this value, instead we run total eval - `steps` on TPU for a single `Session.run`. - num_shards: (Deprecated, ignored by TPUEstimator). - The number of model replicas in the system. For non-model-parallelism - case, this number equals the total number of TPU cores. For - model-parallelism, the total number of TPU cores equals - num_cores_per_replica * num_shards. 
- num_cores_per_replica: Defaults to `None`, which disables model parallelism. - An integer which describes the number of TPU cores per model replica. This - is required by model-parallelism which enables partitioning - the model to multiple cores. Currently num_cores_per_replica must be - 1, 2, 4, or 8. - per_host_input_for_training: If `True`, `PER_HOST_V1`, or `PER_HOST_V2`, - `input_fn` is invoked once on each host. With the per-core input pipeline - configuration, it is invoked once for each core. - With a global batch size `train_batch_size` in `TPUEstimator` constructor, - the batch size for each shard is `train_batch_size` // #hosts in the - `True` or `PER_HOST_V1` mode. In `PER_HOST_V2` mode, it is - `train_batch_size` // #cores. In `BROADCAST` mode, `input_fn` is only - invoked once on host 0 and the tensors are broadcasted to all other - replicas. The batch size equals to `train_batch_size`. With the per-core - input pipeline configuration, the shard batch size is also - `train_batch_size` // #cores. - Note: per_host_input_for_training==PER_SHARD_V1 only supports mode.TRAIN. - tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred - within TPUEstimator, however when using ClusterSpec propagation in more - esoteric cluster configurations, you may need to specify the job name as a - string. - initial_infeed_sleep_secs: The number of seconds the infeed thread should - wait before enqueueing the first batch. This helps avoid timeouts for - models that require a long compilation time. - input_partition_dims: A nested list to describe the partition dims - for all the tensors from input_fn(). The structure of - input_partition_dims must match the structure of `features` and - `labels` from input_fn(). The total number of partitions must match - `num_cores_per_replica`. For example, if input_fn() returns two tensors: - images with shape [N, H, W, C] and labels [N]. - input_partition_dims = [[1, 2, 2, 1], None] will split the images to 4 - pieces and feed into 4 TPU cores. labels tensor are directly broadcasted - to all the TPU cores since the partition dims is `None`. - Current limitations: This feature is only supported with the PER_HOST_V2 - input mode. - eval_training_input_configuration: If `SLICED`, `input_fn` is only - invoked once on host 0 and the tensors are broadcasted to all other - replicas. Unlike per_host_input_for_training=BROADCAST, each replica will - only get a slice of the data instead of a whole copy. If `PER_HOST_V1`, - the behaviour is determined by per_host_input_for_training. - - Raises: - ValueError: If `num_cores_per_replica` is not 1, 2, 4, 8 or 16. - """ - - def __new__( - cls, - iterations_per_loop=2, - num_shards=None, - num_cores_per_replica=None, - per_host_input_for_training=True, - tpu_job_name=None, - initial_infeed_sleep_secs=None, - input_partition_dims=None, - eval_training_input_configuration=InputPipelineConfig.PER_HOST_V1): - - # Check iterations_per_loop. - util_lib.check_positive_integer(iterations_per_loop, - 'TPUConfig iterations_per_loop') - - # Check num_shards. 
- if num_shards is not None: - util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards') - - if input_partition_dims is not None: - if len(input_partition_dims) != 1 and len(input_partition_dims) != 2: - raise ValueError( - 'input_partition_dims must be a list/tuple with one or two' - ' elements.') - - if per_host_input_for_training is not InputPipelineConfig.PER_HOST_V2: - raise ValueError( - 'input_partition_dims is only supported in PER_HOST_V2 mode.') - - if num_cores_per_replica is None: - raise ValueError( - 'input_partition_dims requires setting num_cores_per_replica.') - - # Check num_cores_per_replica - if num_cores_per_replica is not None: - if num_cores_per_replica not in [1, 2, 4, 8, 16]: - raise ValueError( - 'num_cores_per_replica must be 1, 2, 4, 8, or 16; got {}'.format( - str(num_cores_per_replica))) - - if eval_training_input_configuration not in [ - InputPipelineConfig.PER_HOST_V1, InputPipelineConfig.SLICED - ]: - raise ValueError( - 'eval_training_input_configuration must be PER_HOST_V1 or SLICED;' - ' got {}'.format(str(eval_training_input_configuration))) - - # per_host_input_for_training may be True, False, or integer in [1..3]. - # Map legacy values (True, False) to numeric values. - if per_host_input_for_training is False: - per_host_input_for_training = InputPipelineConfig.PER_SHARD_V1 - elif per_host_input_for_training is True: - per_host_input_for_training = InputPipelineConfig.PER_HOST_V1 - - # Check initial_infeed_sleep_secs. - if initial_infeed_sleep_secs: - util_lib.check_positive_integer(initial_infeed_sleep_secs, - 'TPUConfig initial_infeed_sleep_secs') - - tpu_job_name = tpu_job_name or _get_tpu_job_name_from_tf_config() - - return super(TPUConfig, cls).__new__( - cls, - iterations_per_loop=iterations_per_loop, - num_shards=num_shards, - num_cores_per_replica=num_cores_per_replica, - per_host_input_for_training=per_host_input_for_training, - tpu_job_name=tpu_job_name, - initial_infeed_sleep_secs=initial_infeed_sleep_secs, - input_partition_dims=input_partition_dims, - eval_training_input_configuration=eval_training_input_configuration) - - -class RunConfig(run_config_lib.RunConfig): - """RunConfig with TPU support.""" - - def __init__(self, - tpu_config=None, - evaluation_master=None, - master=None, - cluster=None, - **kwargs): - """Constructs a RunConfig. - - Args: - tpu_config: the TPUConfig that specifies TPU-specific configuration. - evaluation_master: a string. The address of the master to use for eval. - Defaults to master if not set. - master: a string. The address of the master to use for training. - cluster: a ClusterResolver - **kwargs: keyword config parameters. - - Raises: - ValueError: if cluster is not None and the provided session_config has a - cluster_def already. - """ - super(RunConfig, self).__init__(**kwargs) - self._tpu_config = tpu_config or TPUConfig() - self._cluster = cluster - - # If user sets master and/or evaluation_master explicitly, including empty - # string '', take it. Otherwise, take the values set by parent class. - if master is not None: - if cluster is not None: - raise ValueError('Both master and cluster are set.') - self._master = master - else: - if cluster: - self._master = cluster.master() - - if evaluation_master is not None: - self._evaluation_master = evaluation_master - elif (not self._evaluation_master and - self.task_type != run_config_lib.TaskType.EVALUATOR): - # If the task type is EVALUATOR, it means some cluster manager sets the - # TF_CONFIG. 
In that case, we respect the configuration in TF_CONFIG. - # - # Otherwise, it means user executes the code without external cluster - # manager. For that, we optimize the user experience by setting - # evaluation_master to master, unless user overwrites it. - self._evaluation_master = self._master - - # Set the ClusterSpec to use - if cluster: - self._cluster_spec = cluster.cluster_spec() - - # Merge the cluster_def into the ConfigProto. - if self._session_config is None: # pylint: disable=access-member-before-definition - self._session_config = config_pb2.ConfigProto( - allow_soft_placement=True, isolate_session_state=True) - if self._session_config.HasField('cluster_def'): - raise ValueError( - 'You cannot provide a ClusterResolver and ' - 'session_config.cluster_def.') - if self._cluster_spec: - self._session_config.cluster_def.CopyFrom( - self._cluster_spec.as_cluster_def()) - - def _maybe_overwrite_session_config_for_distributed_training(self): - # Overrides the parent class session_config overwrite for between-graph. TPU - # runs with in-graph, which should not have device filter. Doing nothing - # ("pass") basically disables it. - pass - - @property - def evaluation_master(self): - return self._evaluation_master - - @property - def master(self): - return self._master - - @property - def tpu_config(self): - return self._tpu_config - - @property - def cluster(self): - return self._cluster - - def replace(self, **kwargs): - if 'tpu_config' not in kwargs: - return super(RunConfig, self).replace(**kwargs) - - tpu_config = kwargs.pop('tpu_config') - new_instance = super(RunConfig, self).replace(**kwargs) - new_instance._tpu_config = tpu_config # pylint: disable=protected-access - return new_instance - - -def _get_tpu_job_name_from_tf_config(): - """Extracts the TPU job name from TF_CONFIG env variable.""" - # TODO(xiejw): Extends this to support both TF_CONFIG env variable and cluster - # spec propagation. - tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}')) - tpu_job_name = tf_config.get(_SERVICE_KEY, {}).get(_TPU_WORKER_JOB_NAME) - if tpu_job_name: - logging.info('Load TPU job name from TF_CONFIG: %s', tpu_job_name) - return tpu_job_name +# pylint: disable=wildcard-import,unused-import +from tensorflow_estimator.python.estimator.tpu.tpu_config import * +# pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/python/tpu/tpu_config_test.py b/tensorflow/python/tpu/tpu_config_test.py deleted file mode 100644 index 22fb3032169..00000000000 --- a/tensorflow/python/tpu/tpu_config_test.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
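# Illustrative check (not part of this change) of what the stub files in this
# diff are intended to preserve: legacy tensorflow.python.tpu import paths keep
# resolving, with the symbols now re-exported from tensorflow_estimator through
# the wildcard imports above.
from tensorflow.python.tpu import tpu_config as legacy_tpu_config
from tensorflow_estimator.python.estimator.tpu import tpu_config as estimator_tpu_config

assert legacy_tpu_config.TPUConfig is estimator_tpu_config.TPUConfig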
-# ============================================================================== -"""TPU RunConfig tests.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import json - -from tensorflow.core.protobuf import config_pb2 -from tensorflow.python.estimator import run_config as run_config_lib -from tensorflow.python.platform import test -from tensorflow.python.tpu import tpu_config as tpu_config_lib - - -def _set_tf_config_env_variable(tf_config): - return test.mock.patch.dict('os.environ', { - 'TF_CONFIG': json.dumps(tf_config) - }) - - -class TPURunConfigTest(test.TestCase): - - def test_no_session_config_set_in_local_case(self): - run_config = tpu_config_lib.RunConfig() - self.assertIsNone(run_config.session_config) - - def test_no_session_config_overwrite_in_local_case(self): - session_config = config_pb2.ConfigProto(allow_soft_placement=True) - run_config = tpu_config_lib.RunConfig(session_config=session_config) - self.assertEqual(session_config, run_config.session_config) - - def test_no_session_config_set_with_cluster_spec(self): - tf_config = { - 'cluster': { - run_config_lib.TaskType.CHIEF: ['host3:3'], - run_config_lib.TaskType.WORKER: ['host3:4'] - }, - 'task': { - 'type': run_config_lib.TaskType.CHIEF, - 'index': 0 - } - } - with _set_tf_config_env_variable(tf_config): - run_config = tpu_config_lib.RunConfig() - self.assertIsNone(run_config.session_config) - - def test_no_session_config_overwrite_with_cluster_spec(self): - tf_config = { - 'cluster': { - run_config_lib.TaskType.CHIEF: ['host3:3'], - run_config_lib.TaskType.WORKER: ['host3:4'] - }, - 'task': { - 'type': run_config_lib.TaskType.CHIEF, - 'index': 0 - } - } - with _set_tf_config_env_variable(tf_config): - session_config = config_pb2.ConfigProto(allow_soft_placement=True) - run_config = tpu_config_lib.RunConfig(session_config=session_config) - self.assertEqual(session_config, run_config.session_config) - - def test_fail_with_invalid_num_shards(self): - with self.assertRaisesRegexp(ValueError, 'must be positive'): - tpu_config_lib.RunConfig( - tpu_config=tpu_config_lib.TPUConfig(num_shards=0)) - - def test_fail_with_iterations_per_loop(self): - with self.assertRaisesRegexp(ValueError, 'must be positive'): - tpu_config_lib.RunConfig( - tpu_config=tpu_config_lib.TPUConfig(iterations_per_loop=0)) - - def test_fail_with_invalid_num_cores_per_replica(self): - with self.assertRaisesRegexp( - ValueError, 'num_cores_per_replica must be 1, 2, 4, 8, or 16;' - ' got 7'): - tpu_config_lib.TPUConfig(num_cores_per_replica=7) - - -class TPURunConfigMasterTest(test.TestCase): - - def test_default_values(self): - run_config = tpu_config_lib.RunConfig() - self.assertEqual('', run_config.master) - self.assertEqual('', run_config.evaluation_master) - - def test_user_provided_master_and_evaluation_master(self): - run_config = tpu_config_lib.RunConfig( - master='_master_123', evaluation_master='_eval_master_123') - self.assertEqual('_master_123', run_config.master) - self.assertEqual('_eval_master_123', run_config.evaluation_master) - - def test_evaluation_master_defaults_to_master(self): - run_config = tpu_config_lib.RunConfig(master='_master_123') - self.assertEqual('_master_123', run_config.master) - self.assertEqual('_master_123', run_config.evaluation_master) - - def test_tf_config(self): - tf_config = { - 'session_master': '_master_123', - 'eval_session_master': '_eval_master_123' - } - with _set_tf_config_env_variable(tf_config): - run_config = 
tpu_config_lib.RunConfig() - self.assertEqual('_master_123', run_config.master) - self.assertEqual('_eval_master_123', run_config.evaluation_master) - - def test_evaluation_master_defaults_to_master_in_tf_config(self): - tf_config = { - 'session_master': '_master_123', - } - with _set_tf_config_env_variable(tf_config): - run_config = tpu_config_lib.RunConfig() - self.assertEqual('_master_123', run_config.master) - self.assertEqual('_master_123', run_config.evaluation_master) - - def test_respect_evaluation_master_in_tf_config(self): - tf_config = { - 'cluster': { - run_config_lib.TaskType.CHIEF: ['host0:0'], - }, - 'task': { - 'type': run_config_lib.TaskType.EVALUATOR, - 'index': 0 - }, - } - with _set_tf_config_env_variable(tf_config): - run_config = tpu_config_lib.RunConfig(master='_something') - self.assertEqual('', run_config.evaluation_master) - - def test_user_overwrites_tf_config(self): - tf_config = { - 'session_master': '_master_123', - 'eval_session_master': '_eval_master_123' - } - with _set_tf_config_env_variable(tf_config): - run_config = tpu_config_lib.RunConfig( - master='_new_master_123', evaluation_master='_new_eval_master_123') - self.assertEqual('_new_master_123', run_config.master) - self.assertEqual('_new_eval_master_123', run_config.evaluation_master) - - def test_user_overwrites_master_in_tf_config(self): - tf_config = { - 'session_master': '_master_123', - 'eval_session_master': '_eval_master_123' - } - with _set_tf_config_env_variable(tf_config): - run_config = tpu_config_lib.RunConfig(master='_new_master_123') - self.assertEqual('_new_master_123', run_config.master) - self.assertEqual('_eval_master_123', run_config.evaluation_master) - - -class TPUJobNameTest(test.TestCase): - - def test_default_name(self): - config = tpu_config_lib.RunConfig() - self.assertIsNone(config.tpu_config.tpu_job_name) - - def test_with_tf_config(self): - tf_config = {'service': {'tpu_worker_job_name': '_my_new_name',}} - with _set_tf_config_env_variable(tf_config): - config = tpu_config_lib.RunConfig() - self.assertEqual('_my_new_name', config.tpu_config.tpu_job_name) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/python/tpu/tpu_context.py b/tensorflow/python/tpu/tpu_context.py index c6d25048842..573f49b2b9b 100644 --- a/tensorflow/python/tpu/tpu_context.py +++ b/tensorflow/python/tpu/tpu_context.py @@ -1,749 +1,23 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
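# Sketch mirroring the deleted TPUJobNameTest above (illustrative value): the
# TF_CONFIG-based tpu_job_name inference is retained by the tensorflow_estimator
# copy of tpu_config that the new stub re-exports.
import json
import os

from tensorflow.python.tpu import tpu_config as tpu_config_lib

os.environ['TF_CONFIG'] = json.dumps(
    {'service': {'tpu_worker_job_name': '_my_new_name'}})
config = tpu_config_lib.RunConfig()
assert config.tpu_config.tpu_job_name == '_my_new_name'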
-# =================================================================== -"""TPU system metadata and associated tooling.""" +# ============================================================================== +"""Stub file to maintain backwards compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -from contextlib import contextmanager -import copy - -from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.tpu import _tpu_estimator_embedding -from tensorflow.python.tpu import device_assignment as tpu_device_assignment -from tensorflow.python.tpu import tpu_config -from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib - - -_DEFAULT_JOB_NAME = 'tpu_worker' -_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator' -_LOCAL_MASTERS = ('', 'local') -_NUM_CORES_TO_COMPUTATION_SHAPE = { - 1: [1, 1, 1], - 2: [1, 1, 2], - 4: [1, 2, 2], - 8: [2, 2, 2], - 16: [4, 2, 2], -} - - -class TPUContext(object): - """A context that holds the current configuration of the TPU computation.""" - - def __init__(self, - internal_ctx, - input_device=None, - invocation_index=None, - call_from_input_fn=True): - self._internal_ctx = internal_ctx - self._input_device = input_device - self._invocation_index = invocation_index - self._call_from_input_fn = call_from_input_fn - - def current_input_fn_deployment(self): - """The configuration of the current input_fn invocation. - - The configuration depends on `TPUConfig.per_host_input_for_training`. See - `TPUConfig` for details. - - Only set in params dict of input_fn - - Returns: - A tuple of - 1. Device spec string: String, is the current CPU host where the - input_fn is invoked. - 2. Current invocation index: Int, 0-based index of the input_fn - invocation. See next item for details. - 3. Total invocation count: Int, the total number of times to invoke the - input_fn on all CPU hosts. Each invocation will be passed with a new - `TPUContext` instance with current invocation index set properly. - 4. Total number of replicas consumed by current_invocation: Int, the - number of replicas fed by the data returned by current input_fn. For - example, for per_core input pipeline deployment - and non-model-parallelism, total invocation count is equal to - the number of cores in the system and num replicas consumed by - current invocation is 1. For per-host v2 input pipeline deployment, - total invocation count is equal to the number of hosts in the system - and num replicas consumed by current invocation is equal to number of - cores per host. - - Raises: - RuntimeError: If this method must not be called from input_fn. - """ - if not self._call_from_input_fn: - raise RuntimeError('This TPUContext instance must not be called from' - ' model_fn.') - - if self._internal_ctx.is_input_sharded_per_core(): - total_invocation_count = (self._internal_ctx.num_hosts - * self._internal_ctx.num_of_replicas_per_host) - replicas_consumed = 1 - elif self._internal_ctx.is_input_broadcast_with_iterators(): - total_invocation_count = 1 - replicas_consumed = self._internal_ctx.num_replicas - else: - total_invocation_count = self._internal_ctx.num_hosts - replicas_consumed = self._internal_ctx.num_of_replicas_per_host - return (self._input_device, self._invocation_index, - total_invocation_count, replicas_consumed) - - @property - def num_replicas(self): - """The total number of replicas. 
- - For non-model-parallelism, num_replicas should be the total num of TPU - cores in the system. - - Returns: - The number of replicas. - """ - return self._internal_ctx.num_replicas - - @property - def num_hosts(self): - """The number of hosts for the TPU system.""" - return self._internal_ctx.num_hosts - - @property - def current_host(self): - """The current host index for the TPU system.""" - return self._invocation_index - - @property - def num_of_replicas_per_host(self): - """The number of replicas for each host.""" - if self._internal_ctx.model_parallelism_enabled: - raise ValueError( - 'num_of_replicas_per_host is not supported for model_parallelism') - return self._internal_ctx.num_of_replicas_per_host - - @property - def device_assignment(self): - """Returns device_assignment object.""" - if self._call_from_input_fn: - raise RuntimeError('This TPUContext instance must not be called from' - ' input_fn.') - return self._internal_ctx.device_assignment - - def device_for_replica(self, replica_id): - """Returns the tuple of (CPU device and device ordinal) for replica. - - This should be used for full replicate for non-model-parallelism. - - Args: - replica_id: Int, the replica index. - - Returns: - A tuple of device spec for CPU device and int device ordinal. - """ - # Note that: For the non-model parallelism, the mapping could be - # a random permutation. The order should not matter in most cases - # as far as model is replicated to all cores in the system. - return self._internal_ctx.device_for_replica(replica_id) - - @property - def tpu_host_placement_function(self): - """Returns the TPU host place function. - - The place function takes host_id as the input and returns the TF device - for the correspoding host. - """ - - def _placement_function(host_id): - """Return the host device given host_id.""" - return self._internal_ctx.tpu_host_placement_function(host_id=host_id) - - return _placement_function - - -class _InternalTPUContext(object): - """A context holds immutable states of TPU computation. - - This immutable object holds TPUEstimator config, train/eval batch size, and - `TPUEstimator.use_tpu`, which is expected to be passed around. It also - provides utility functions, based on the current state, to determine other - information commonly required by TPU computation, such as TPU device names, - TPU hosts, shard batch size, etc. - - if eval_on_tpu is False, then execution of eval on TPU is disabled. - if eval_on_tpu is True, but use_tpu is False, a warning is issued, - and TPU execution is disabled for all modes. - - N.B. As `mode` is not immutable state in Estimator, but essential to - distinguish between TPU training and evaluation, a common usage for - _InternalTPUContext with `mode` is as follows: - ``` - with _ctx.with_mode(mode) as ctx: - if ctx.is_running_on_cpu(): - ... 
- ``` - """ - - def __init__(self, - config, - train_batch_size, - eval_batch_size, - predict_batch_size, - use_tpu, - eval_on_tpu=True, - embedding_config_spec=None): - self._config = config - self._train_batch_size = train_batch_size - self._eval_batch_size = eval_batch_size - self._predict_batch_size = predict_batch_size - self._use_tpu = use_tpu - logging.info('_TPUContext: eval_on_tpu %s', eval_on_tpu) - if not use_tpu and eval_on_tpu: - logging.warning('eval_on_tpu ignored because use_tpu is False.') - - self._eval_on_tpu = eval_on_tpu - self._model_parallelism_enabled = ( - use_tpu and config.tpu_config.num_cores_per_replica) - self._mode = None - num_cores_per_replica = config.tpu_config.num_cores_per_replica - if self._model_parallelism_enabled: - self._computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[ - num_cores_per_replica] - else: - self._computation_shape = None - self._lazy_tpu_system_metadata_dict = {} # key by master address - self._lazy_device_assignment_dict = {} # key by master address - self._lazy_validation_dict = {} # key by ModeKeys - self._embedding_config_spec = embedding_config_spec - self._lazy_embedding_config_dict = {} # key by master address - - def _assert_mode(self): - if self._mode is None: - raise RuntimeError( - '`mode` needs to be set via contextmanager `with_mode`.') - return self._mode - - @contextmanager - def with_mode(self, mode): - # NOTE(xiejw): Shallow copy is enough. It will share he lazy dictionaries, - # such as _lazy_tpu_system_metadata_dict between new copy and the original - # one. Note that all lazy states stored in properties _lazy_foo are sort of - # immutable as they should be same for the process lifetime. - new_ctx = copy.copy(self) - new_ctx._mode = mode # pylint: disable=protected-access - yield new_ctx - - @property - def mode(self): - return self._assert_mode() - - def _get_master_address(self): - mode = self._assert_mode() - config = self._config - master = ( - config.master - if mode != model_fn_lib.ModeKeys.EVAL else config.evaluation_master) - return master - - def _get_tpu_system_metadata(self): - """Gets the (maybe cached) TPU system metadata.""" - master = self._get_master_address() - tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master) - if tpu_system_metadata is not None: - return tpu_system_metadata - - cluster_def = None - if (self._config.session_config and - self._config.session_config.cluster_def.job): - cluster_def = self._config.session_config.cluster_def - - # pylint: disable=protected-access - tpu_system_metadata = ( - tpu_system_metadata_lib._query_tpu_system_metadata( - master, - cluster_def=cluster_def, - query_topology=self.model_parallelism_enabled)) - - self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata - return tpu_system_metadata - - def _get_device_assignment(self): - """Gets the (maybe cached) TPU device assignment.""" - master = self._get_master_address() - device_assignment = self._lazy_device_assignment_dict.get(master) - if device_assignment is not None: - return device_assignment - - tpu_system_metadata = self._get_tpu_system_metadata() - - device_assignment = tpu_device_assignment.device_assignment( - tpu_system_metadata.topology, - computation_shape=self._computation_shape, - num_replicas=self.num_replicas) - - logging.info('num_cores_per_replica: %s', - str(self._config.tpu_config.num_cores_per_replica)) - logging.info('computation_shape: %s', str(self._computation_shape)) - logging.info('num_replicas: %d', self.num_replicas) - 
logging.info('device_assignment.topology.device_coordinates: %s', - str(device_assignment.topology.device_coordinates)) - logging.info('device_assignment.core_assignment: %s', - str(device_assignment.core_assignment)) - - self._lazy_device_assignment_dict[master] = device_assignment - return device_assignment - - @property - def embedding_config(self): - """Returns the embedding config based on current mode.""" - master = self._get_master_address() - if master in self._lazy_embedding_config_dict: - embedding_config = self._lazy_embedding_config_dict[master] - else: - embedding_config = None - if self._use_tpu and self._embedding_config_spec: - embedding_config = _tpu_estimator_embedding.EmbeddingConfig( - self._embedding_config_spec, self._train_batch_size, - self._eval_batch_size, self.num_hosts, self.num_cores, self.config) - if not embedding_config.has_embedding_tables(): - embedding_config = None - self._lazy_embedding_config_dict[master] = embedding_config - - if embedding_config is not None: - mode = self._assert_mode() - # Dynamically attach tpu_embedding based on mode. With - # this, we could keep embedding_config immutable but call site always - # accesses the unified API '.tpu_embedding'. - embedding_config.tpu_embedding = embedding_config.get_tpu_embedding(mode) - return embedding_config - - @property - def model_parallelism_enabled(self): - return self._model_parallelism_enabled - - @property - def input_partition_dims(self): - return self._config.tpu_config.input_partition_dims - - @property - def device_assignment(self): - return (self._get_device_assignment() - if self._model_parallelism_enabled else None) - - @property - def num_of_cores_per_host(self): - metadata = self._get_tpu_system_metadata() - return metadata.num_of_cores_per_host - - @property - def num_cores(self): - metadata = self._get_tpu_system_metadata() - return metadata.num_cores - - @property - def num_of_replicas_per_host(self): - """Return the number of replicas per host.""" - if self.model_parallelism_enabled: - return self.num_replicas // self.num_hosts - else: - return self.num_of_cores_per_host - - @property - def num_replicas(self): - num_cores_in_system = self.num_cores - - if self.model_parallelism_enabled: - num_cores_per_replica = self._config.tpu_config.num_cores_per_replica - if num_cores_per_replica > num_cores_in_system: - raise ValueError( - 'The num of cores required by the model parallelism, specified by ' - 'TPUConfig.num_cores_per_replica, is larger than the total num of ' - 'TPU cores in the system. num_cores_per_replica: {}, num cores ' - 'in the system: {}'.format(num_cores_per_replica, - num_cores_in_system)) - - if num_cores_in_system % num_cores_per_replica != 0: - raise RuntimeError( - 'The num of cores in the system ({}) is not divisible by the num ' - 'of cores ({}) required by the model parallelism, specified by ' - 'TPUConfig.num_cores_per_replica. 
This should never happen!'.format( - num_cores_in_system, num_cores_per_replica)) - - return num_cores_in_system // num_cores_per_replica - else: - return num_cores_in_system - - @property - def num_hosts(self): - metadata = self._get_tpu_system_metadata() - return metadata.num_hosts - - @property - def config(self): - return self._config - - def is_input_sharded_per_core(self): - """Return true if input_fn is invoked per-core (other than per-host).""" - mode = self._assert_mode() - return (mode == model_fn_lib.ModeKeys.TRAIN and - (self._config.tpu_config.per_host_input_for_training is - tpu_config.InputPipelineConfig.PER_SHARD_V1)) - - def is_input_per_host_with_iterators(self): - """Return true if input_fn should be run in the per-host v2 config.""" - return (self._config.tpu_config.per_host_input_for_training is - tpu_config.InputPipelineConfig.PER_HOST_V2) - - def is_input_broadcast_with_iterators(self): - """Return true if input_fn should be run in the full_replicae config.""" - mode = self._assert_mode() - return ((self._config.tpu_config.per_host_input_for_training is - tpu_config.InputPipelineConfig.BROADCAST) or - (mode != model_fn_lib.ModeKeys.TRAIN and - self._config.tpu_config.eval_training_input_configuration is - tpu_config.InputPipelineConfig.SLICED)) - - def is_running_on_cpu(self, is_export_mode=False): - """Determines whether the input_fn and model_fn should be invoked on CPU. - - This API also validates user provided configuration, such as batch size, - according the lazy initialized TPU system metadata. - - Args: - is_export_mode: Indicates whether the current mode is for exporting the - model, when mode == PREDICT. Only with this bool, we could - tell whether user is calling the Estimator.predict or - Estimator.export_savedmodel, which are running on TPU and CPU - respectively. Parent class Estimator does not distinguish these two. - - Returns: - bool, whether current input_fn or model_fn should be running on CPU. - - Raises: - ValueError: any configuration is invalid. 
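# Worked example (illustrative numbers) for the shard batch size logic that
# follows: with a global train batch size of 1024 and 8 replicas spread over
# 2 hosts, PER_HOST_V2 (or per-core) input hands each input_fn invocation
# 1024 // 8 = 128 examples, while PER_HOST_V1 hands it 1024 // 2 = 512.
global_batch_size, num_replicas, num_hosts = 1024, 8, 2
per_replica_batch = global_batch_size // num_replicas  # 128 (PER_HOST_V2 / per-core)
per_host_batch = global_batch_size // num_hosts        # 512 (PER_HOST_V1)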
- """ - - is_running_on_cpu = self._is_running_on_cpu(is_export_mode) - if not is_running_on_cpu: - self._validate_tpu_configuration() - return is_running_on_cpu - - def _is_running_on_cpu(self, is_export_mode): - """Determines whether the input_fn and model_fn should be invoked on CPU.""" - mode = self._assert_mode() - - if not self._use_tpu: - return True - - if mode == model_fn_lib.ModeKeys.EVAL and not self._eval_on_tpu: - logging.info('_is_running_on_cpu: eval_on_tpu disabled') - return True - - if is_export_mode: - return True - - return False - - @property - def global_batch_size(self): - mode = self._assert_mode() - if mode == model_fn_lib.ModeKeys.TRAIN: - return self._train_batch_size - elif mode == model_fn_lib.ModeKeys.EVAL: - return self._eval_batch_size - elif mode == model_fn_lib.ModeKeys.PREDICT: - return self._predict_batch_size - else: - return None - - @property - def batch_size_for_input_fn(self): - """Returns the shard batch size for `input_fn`.""" - global_batch_size = self.global_batch_size - if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()): - return global_batch_size - - # On TPU - if self.is_input_sharded_per_core() or ( - self.is_input_per_host_with_iterators()): - return global_batch_size // self.num_replicas - else: - return global_batch_size // self.num_hosts - - @property - def batch_size_for_model_fn(self): - """Returns the shard batch size for `model_fn`.""" - global_batch_size = self.global_batch_size - - if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()): - return global_batch_size - - # On TPU. always sharded per shard. - return global_batch_size // self.num_replicas - - @property - def master_job(self): - """Returns the job name to use to place TPU computations on. - - Returns: - A string containing the job name, or None if no job should be specified. - - Raises: - ValueError: If the user needs to specify a tpu_job_name, because we are - unable to infer the job name automatically, or if the user-specified job - names are inappropriate. - """ - run_config = self._config - # If the user specifies the tpu_job_name, use that. - if run_config.tpu_config.tpu_job_name: - return run_config.tpu_config.tpu_job_name - - # The tpu job is determined by the run_config. Right now, this method is - # required as tpu_config is not part of the RunConfig. 
- mode = self._assert_mode() - master = ( - run_config.evaluation_master - if mode == model_fn_lib.ModeKeys.EVAL else run_config.master) - cluster_def = (run_config.session_config.cluster_def - if run_config.session_config else None) - - return tpu_system_metadata_lib.master_job(master, cluster_def) - - @property - def tpu_host_placement_function(self): - """Returns the TPU host place function.""" - - master = self.master_job - - def _placement_function(_sentinal=None, replica_id=None, host_id=None): # pylint: disable=invalid-name - """Return the host device given replica_id or host_id.""" - assert _sentinal is None - if replica_id is not None and host_id is not None: - raise RuntimeError( - 'replica_id and host_id can have only one non-None value.') - - if master is None: - return '/replica:0/task:0/device:CPU:0' - else: - if replica_id is not None: - if self.model_parallelism_enabled: - return self.device_assignment.host_device( - replica=replica_id, job=master) - else: - host_id = replica_id / self.num_of_cores_per_host - - return '/job:%s/task:%d/device:CPU:0' % (master, host_id) - - return _placement_function - - @property - def tpu_device_placement_function(self): - """Returns a TPU device placement Fn.""" - master = self.master_job - job_device = '' if master is None else ('/job:%s' % master) - - def _placement_function(i): - if self.model_parallelism_enabled: - return self.device_assignment.tpu_device(replica=i, job=master) - else: - num_of_cores_per_host = self.num_of_cores_per_host - host_id = i / num_of_cores_per_host - ordinal_id = i % num_of_cores_per_host - return '%s/task:%d/device:TPU:%d' % (job_device, host_id, ordinal_id) - - return _placement_function - - def tpu_ordinal_function(self, host_id): - """Returns the TPU ordinal fn.""" - - def _tpu_ordinal_function(shard_index_in_host): - """Return the TPU ordinal associated with a shard. - - Required because the enqueue ops are placed on CPU. - - Args: - shard_index_in_host: the shard index - - Returns: - The ordinal of the TPU device the shard's infeed should be placed on. - """ - if self.model_parallelism_enabled: - # We put both enqueue/dequeue ops at tpu.core(0) in each replica. - replica = self.device_assignment.lookup_replicas(host_id, - 0)[shard_index_in_host] - return self.device_assignment.tpu_ordinal(replica=replica) - else: - return shard_index_in_host % self.num_of_cores_per_host - - return _tpu_ordinal_function - - def _validate_tpu_configuration(self): - """Validates the configuration based on the TPU system metadata.""" - mode = self._assert_mode() - if self._lazy_validation_dict.get(mode): - return - - # All following information is obtained from TPU system metadata. - num_cores = self.num_cores - num_replicas = self.num_replicas - num_hosts = self.num_hosts - - if not num_cores: - tpu_system_metadata = self._get_tpu_system_metadata() - raise RuntimeError( - 'Cannot find any TPU cores in the system. Please double check ' - 'Tensorflow master address and TPU worker(s). Available devices ' - 'are {}.'.format(tpu_system_metadata.devices)) - - if self._config.tpu_config.num_shards: - user_provided_num_replicas = self._config.tpu_config.num_shards - if user_provided_num_replicas != num_replicas: - message = ( - 'TPUConfig.num_shards is not set correctly. According to TPU ' - 'system metadata for Tensorflow master ({}): num_replicas should ' - 'be ({}), got ({}). For non-model-parallelism, num_replicas should ' - 'be the total num of TPU cores in the system. 
For ' - 'model-parallelism, the total number of TPU cores should be ' - 'num_cores_per_replica * num_replicas. Please set it ' - 'accordingly or leave it as `None`'.format( - self._get_master_address(), num_replicas, - user_provided_num_replicas)) - - raise ValueError(message) - - if self._config.tpu_config.num_cores_per_replica: - num_cores_per_replica = self._config.tpu_config.num_cores_per_replica - num_cores_per_host = self._get_tpu_system_metadata().num_of_cores_per_host - if num_cores_per_replica > num_cores_per_host: - raise ValueError( - 'The num of cores required by the model parallelism, specified by ' - 'TPUConfig.num_cores_per_replica, is larger than the ' - 'num_cores_per_host. num_cores_per_replica: {}, ' - 'num_cores_per_host: {}'.format(num_cores_per_replica, - num_cores_per_host)) - - if mode == model_fn_lib.ModeKeys.TRAIN: - if (self._train_batch_size % num_replicas != 0 and - not self.is_input_broadcast_with_iterators()): - raise ValueError( - 'train batch size {} must be divisible by number of replicas {}' - .format(self._train_batch_size, num_replicas)) - - elif mode == model_fn_lib.ModeKeys.EVAL: - if self._eval_batch_size is None: - raise ValueError( - 'eval_batch_size in TPUEstimator constructor cannot be `None`' - 'if .evaluate is running on TPU.') - if (self._eval_batch_size % num_replicas != 0 and - not self.is_input_broadcast_with_iterators()): - raise ValueError( - 'eval batch size {} must be divisible by number of replicas {}' - .format(self._eval_batch_size, num_replicas)) - if num_hosts > 1 and not self.is_input_broadcast_with_iterators(): - raise ValueError( - 'TPUEstimator.evaluate should be running on single TPU' - ' instead of a Pod.') - else: - assert mode == model_fn_lib.ModeKeys.PREDICT - if self._predict_batch_size is None: - raise ValueError( - 'predict_batch_size in TPUEstimator constructor should not be ' - '`None` if .predict is running on TPU.') - if (self._predict_batch_size % num_replicas != 0 and - not self.is_input_broadcast_with_iterators()): - raise ValueError( - 'predict batch size {} must be divisible by number of replicas {}' - .format(self._predict_batch_size, num_replicas)) - if num_hosts > 1 and not self.is_input_broadcast_with_iterators(): - raise ValueError( - 'TPUEstimator.predict should be running on single TPU worker. ' - 'got {}.'.format(num_hosts)) - - # Record the state "validated" into lazy dictionary. - self._lazy_validation_dict[mode] = True - - def device_for_replica(self, replica_id): - """Returns the tuple of (CPU device and device ordinal) for replica. - - This should be used for full replicate for non-model-parallelism. - - Args: - replica_id: Int, the replica index. - - Returns: - A tuple of device spec for CPU device and int device ordinal. 
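# Worked example (placeholder values) for the non-model-parallel branch of
# device_for_replica below: replica 5 with 4 replicas per host on job
# 'tpu_worker' maps to ('/job:tpu_worker/task:1/device:CPU:0', 1).
replica_id, num_of_replicas_per_host = 5, 4
host_id = replica_id // num_of_replicas_per_host    # 1
ordinal_id = replica_id % num_of_replicas_per_host  # 1
host_device = '/job:tpu_worker/task:%d/device:CPU:0' % host_id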
- """ - master = self.master_job - - if self.model_parallelism_enabled: - return (self.device_assignment.host_device( - replica=replica_id, job=master), - self.device_assignment.tpu_ordinal(replica=replica_id)) - - job_device = '' if master is None else ('/job:%s' % master) - - num_of_replicas_per_host = self.num_of_replicas_per_host - host_id = replica_id / num_of_replicas_per_host - ordinal_id = replica_id % num_of_replicas_per_host - - host_device = '%s/task:%d/device:CPU:0' % (job_device, host_id) - return (host_device, ordinal_id) - - -class _OneCoreTPUContext(_InternalTPUContext): - """Special _InternalTPUContext for one core usage.""" - - def __init__(self, config, train_batch_size, eval_batch_size, - predict_batch_size, use_tpu): - - super(_OneCoreTPUContext, self).__init__( - config, train_batch_size, eval_batch_size, - predict_batch_size, use_tpu) - - def _get_tpu_system_metadata(self): - """Gets the (maybe cached) TPU system metadata.""" - master = self._get_master_address() - tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master) - if tpu_system_metadata is not None: - return tpu_system_metadata - - tpu_system_metadata = ( - tpu_system_metadata_lib._TPUSystemMetadata( # pylint: disable=protected-access - num_cores=1, - num_hosts=1, - num_of_cores_per_host=1, - topology=None, - devices=[])) - - self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata - return tpu_system_metadata - - -def _get_tpu_context(config, train_batch_size, eval_batch_size, - predict_batch_size, use_tpu, eval_on_tpu, - embedding_config_spec): - """Returns an instance of `_InternalTPUContext`.""" - - if (config.tpu_config.num_shards == 1 and - config.tpu_config.num_cores_per_replica is None): - if embedding_config_spec is not None: - raise ValueError('Setting TPUConfig.num_shards==1 is unsupported ' - 'when embedding_config_spec is not None.') - logging.warning( - 'Setting TPUConfig.num_shards==1 is an unsupported behavior. ' - 'Please fix as soon as possible (leaving num_shards as None.)') - return _OneCoreTPUContext(config, train_batch_size, eval_batch_size, - predict_batch_size, use_tpu) - - return _InternalTPUContext(config, train_batch_size, eval_batch_size, - predict_batch_size, use_tpu, eval_on_tpu, - embedding_config_spec) +# pylint: disable=wildcard-import,unused-import +from tensorflow_estimator.python.estimator.tpu.tpu_context import * +# pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/python/tpu/tpu_embedding.py b/tensorflow/python/tpu/tpu_embedding.py index 4a1f94ff333..d3f23a10cee 100644 --- a/tensorflow/python/tpu/tpu_embedding.py +++ b/tensorflow/python/tpu/tpu_embedding.py @@ -34,6 +34,7 @@ from tensorflow.python.ops import init_ops from tensorflow.python.ops import partitioned_variables from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib from tensorflow.python.tpu.ops import tpu_ops @@ -606,7 +607,10 @@ class TPUEmbedding(object): table_descriptor.name = table table_config = self._table_to_config_dict[table] - table_descriptor.vocabulary_size = table_config.vocabulary_size + # For small tables, we pad to the number of hosts so that at least one + # id will be assigned to each host. 
+ table_descriptor.vocabulary_size = max(table_config.vocabulary_size, + len(self.hosts)) table_descriptor.dimension = table_config.dimension table_descriptor.num_features = self._table_to_num_features_dict[table] @@ -1278,14 +1282,19 @@ def _create_device_fn(hosts): def device_fn(op): """Returns the `device` for `op`.""" part_match = re.match(r'.*/part_(\d+)(/|$)', op.name) + dummy_match = re.match(r'.*dummy_(\d+).*', op.name) + if not part_match and not dummy_match: + raise RuntimeError( + 'Internal Error: Expected {} to contain /part_* or dummy_*'.format( + op.name)) if part_match: idx = int(part_match.group(1)) else: - raise RuntimeError('Internal Error: ' - 'Expected %s to contain /part_*.' % op.name) + idx = int(dummy_match.group(1)) device = hosts[idx] + logging.debug('assigning {} to {}.', op, device) return device return device_fn @@ -1298,17 +1307,31 @@ def _create_partitioned_variables(name, initializer, collections=None): # pylint: disable=redefined-outer-name """Creates ParitionedVariables based on `num_hosts` for `table`.""" - # TODO(shizhiw): automatically place embedding lookup elsewhere? - if vocabulary_size < num_hosts: - raise ValueError('`vocabulary_size`({}) is smaller than `num_hosts`({}). ' - 'As TPU embedding is not optimized for small tables, ' - 'please consider other ways for this embedding lookup.') - return list(variable_scope.get_variable( - name, - shape=(vocabulary_size, embedding_dimension), - partitioner=partitioned_variables.fixed_size_partitioner(num_hosts), - dtype=dtypes.float32, - initializer=initializer, - collections=collections, - trainable=False)) + num_slices = min(vocabulary_size, num_hosts) + + var_list = list( + variable_scope.get_variable( + name, + shape=(vocabulary_size, embedding_dimension), + partitioner=partitioned_variables.fixed_size_partitioner(num_slices), + dtype=dtypes.float32, + initializer=initializer, + collections=collections, + trainable=False)) + + if vocabulary_size >= num_hosts: + return var_list + + # For padded part, define the dummy variable to be loaded into TPU system. + for idx in range(num_hosts - vocabulary_size): + var_list.append( + variable_scope.get_variable( + 'dummy_{}_{}'.format(vocabulary_size + idx, name), + shape=(1, embedding_dimension), + dtype=dtypes.float32, + initializer=initializer, + collections=[ops.GraphKeys.LOCAL_VARIABLES], + trainable=False)) + + return var_list diff --git a/tensorflow/python/tpu/tpu_estimator.py b/tensorflow/python/tpu/tpu_estimator.py index 0c231e29ae9..0ee490681e4 100644 --- a/tensorflow/python/tpu/tpu_estimator.py +++ b/tensorflow/python/tpu/tpu_estimator.py @@ -1,4111 +1,33 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
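# Illustrative sketch (placeholder numbers) of the small-table padding added
# above in tpu_embedding.py: a table with vocabulary_size=2 served from 4 hosts
# is now split into min(2, 4) = 2 real partitions plus 2 single-row 'dummy_*'
# local variables instead of raising ValueError, and the table descriptor
# advertises max(2, 4) = 4 rows so every host owns at least one id; the updated
# device_fn routes a variable named like 'dummy_3_<table>' to hosts[3], just as
# it routes '/part_3' variables.
vocabulary_size, num_hosts = 2, 4
padded_vocabulary_size = max(vocabulary_size, num_hosts)  # 4 rows in the descriptor
num_slices = min(vocabulary_size, num_hosts)              # 2 real partitions
num_dummy_vars = max(0, num_hosts - vocabulary_size)      # 2 padded dummy variables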
-# =================================================================== -"""TPUEstimator class.""" +# ============================================================================== +"""Stub file to maintain backwards compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections -import copy -import enum -import os -import signal -import sys -import threading -import time - -import numpy as np -import six -from six.moves import queue as Queue # pylint: disable=redefined-builtin -from six.moves import xrange # pylint: disable=redefined-builtin - -from tensorflow.core.framework import variable_pb2 -from tensorflow.core.framework.summary_pb2 import Summary -from tensorflow.core.protobuf import config_pb2 -from tensorflow.core.protobuf.tpu import compilation_result_pb2 as tpu_compilation_result -from tensorflow.python.client import session as tf_session -from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.data.util import nest as data_nest -from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver -from tensorflow.python.estimator import estimator as estimator_lib -from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.estimator.export import export_output as export_output_lib -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import errors -from tensorflow.python.framework import function -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import batch_ops -from tensorflow.python.ops import check_ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import init_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import resource_variable_ops -from tensorflow.python.ops import state_ops -from tensorflow.python.ops import summary_ops_v2 as contrib_summary -from tensorflow.python.ops import variable_scope -from tensorflow.python.ops import variables -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.saved_model import tag_constants -from tensorflow.python.summary import summary -from tensorflow.python.tpu import _tpu_estimator_embedding -from tensorflow.python.tpu import error_handling -from tensorflow.python.tpu import functional as tpu_functional -from tensorflow.python.tpu import preempted_hook -from tensorflow.python.tpu import session_support -from tensorflow.python.tpu import tensor_tracer -from tensorflow.python.tpu import tpu -from tensorflow.python.tpu import tpu_config -from tensorflow.python.tpu import tpu_context -from tensorflow.python.tpu import tpu_embedding_gradient -from tensorflow.python.tpu import tpu_feed -from tensorflow.python.tpu import tpu_function -from tensorflow.python.tpu import training_loop -from tensorflow.python.tpu import util as util_lib -from tensorflow.python.tpu._tpu_estimator_embedding import AdagradParameters # pylint: disable=unused-import -from tensorflow.python.tpu._tpu_estimator_embedding import AdamParameters # pylint: disable=unused-import -from tensorflow.python.tpu._tpu_estimator_embedding import StochasticGradientDescentParameters # pylint: disable=unused-import -from tensorflow.python.tpu._tpu_estimator_embedding import EmbeddingConfigSpec # pylint: disable=unused-import -from tensorflow.python.tpu.ops import tpu_ops -from tensorflow.python.training import 
basic_session_run_hooks -from tensorflow.python.training import evaluation -from tensorflow.python.training import session_run_hook -from tensorflow.python.training import training -from tensorflow.python.training import training_util -from tensorflow.python.util import function_utils -from tensorflow.python.util import nest -from tensorflow.python.util import tf_inspect - -_INITIAL_LOSS = 1e7 -_ZERO_LOSS = 0. -_TPU_ESTIMATOR = 'tpu_estimator' -_ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop' -_BATCH_SIZE_KEY = 'batch_size' -_CTX_KEY = 'context' -_USE_TPU_KEY = 'use_tpu' -_CROSS_REPLICA_SUM_OP = 'CrossReplicaSum' -_ONE_GIGABYTE = 1024 * 1024 * 1024 -_TPU_ENQUEUE_OPS = '_tpu_enqueue_ops' -_TPU_TRAIN_OP = '_tpu_train_op' -_INFERENCE_ON_TPU_MODE = '_inference_on_tpu' -_KEY_WHEN_PREDICTIONS_IS_A_TENSOR = '_key_when_predictions_is_a_tensor' - -# Ideally _USE_TPU_KEY should be reserved as well. However there are already -# models that make use of this key, thus it can not be reserved now to prevent -# breakage. In the long run, we would like to mitigate this by migrating models -# off of using _USE_TPU_KEY. -_RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY, _CTX_KEY] - -# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is -# only used for per-core based deployments. For per-host based pipelines, if a -# user returns a Dataset instance it will be automatically wrapped in a -# tf.while_loop (This can be disabled by returning features and labels -# explicitly). -_WRAP_INPUT_FN_INTO_WHILE_LOOP = False - -if ops.get_to_proto_function( - '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR)) is None: - ops.register_proto_function( - '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR), - proto_type=variable_pb2.VariableDef, - to_proto=resource_variable_ops._to_proto_fn, # pylint: disable=protected-access - from_proto=resource_variable_ops._from_proto_fn) # pylint: disable=protected-access - - -def _is_iterable(obj): - """A Python 2 and 3 compatible util to check whether `obj` is iterable.""" - try: - iter(obj) - return True - except TypeError: - return False - - -class CatchInvalidHostcallFunctions(control_flow_ops.XLAControlFlowContext): - - def AddOp(self, op): - if op.type in [ - 'AudioSummary', 'AudioSummaryV2', 'HistogramSummary', 'ImageSummary', - 'MergeSummary', 'ScalarSummary', 'TensorSummary', 'TensorSummaryV2' - ]: - raise ValueError('Use tf.contrib.summary inside of host_calls.') - - -def _create_global_step(graph): - graph = graph or ops.get_default_graph() - if training.get_global_step(graph) is not None: - raise ValueError('"global_step" already exists.') - # Create in proper graph and base name_scope. - with graph.as_default() as g, g.name_scope(None): - return variable_scope.get_variable( - ops.GraphKeys.GLOBAL_STEP, - shape=[], - dtype=dtypes.int64, - initializer=init_ops.zeros_initializer(), - trainable=False, - use_resource=True, - collections=[ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP]) - - -def _create_or_get_iterations_per_loop(): - """Creates or gets the iterations_per_loop variable. - - In TPUEstimator, the user provided computation, the model_fn, is wrapped - inside a tf.while_loop for peak performance. The iterations of the loop are - specified by this variable, which adjusts its value on the CPU after each TPU - program execution and before the next TPU execution. - - The purpose of using a variable, rather then a constant, is to allow - TPUEstimator adapt the TPU training iterations according to the final steps - specified by users. 
For example, if the user sets the iterations_per_loop as 4 - in TPUConfig and steps as 10 in TPUEstimator.train(), the iterations_per_loop - variable will have the following value before each TPU training. - - - 1-th TPU execution: iterations_per_loop = 4 - - 2-th TPU execution: iterations_per_loop = 4 - - 3-th TPU execution: iterations_per_loop = 2 - - As model_fn increases the global step once per train_op invocation, the global - step is 10 after all TPU executions, matching the steps=10 inputs passed in by - users. - - Returns: - A TF non-trainable resource variable. - - Raises: - RuntimeError: If multi iterations_per_loop variables were found. - """ - graph = ops.get_default_graph() - collection_name = '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR) - iter_vars = graph.get_collection(collection_name) - if len(iter_vars) == 1: - return iter_vars[0] - elif len(iter_vars) > 1: - raise RuntimeError('Multiple iterations_per_loop_var in collection.') - - with ops.colocate_with(training_util.get_global_step()): - with variable_scope.variable_scope( - _TPU_ESTIMATOR, reuse=variable_scope.AUTO_REUSE): - return variable_scope.get_variable( - _ITERATIONS_PER_LOOP_VAR, - initializer=init_ops.zeros_initializer(), - shape=[], - dtype=dtypes.int32, - trainable=False, - collections=[collection_name, ops.GraphKeys.LOCAL_VARIABLES], - use_resource=True) - - -def _sync_variables_ops(ctx): - """Create varriables synchronization ops. - - Gets the variables back from TPU nodes. This means the variables updated - by TPU will now be *synced* to host memory. - In BROADCAST mode, we skip this sync since the variables are ususally too - big to transmit via RPC. - - Args: - ctx: A `_InternalTPUContext` instance with mode. - - Returns: - A list of sync ops. - """ - - if not ctx.is_input_broadcast_with_iterators(): - return [ - array_ops.check_numerics(v.read_value(), - 'Gradient for %s is NaN' % v.name).op - for v in variables.trainable_variables() - ] - else: - return [control_flow_ops.no_op()] - - -def _increase_eval_step_op(iterations_per_loop): - """Returns an op to increase the eval step for TPU evaluation. - - Args: - iterations_per_loop: Tensor. The number of eval steps running in TPU system - before returning to CPU host for each `Session.run`. - - Returns: - An operation - """ - eval_step = evaluation._get_or_create_eval_step() # pylint: disable=protected-access - # Estimator evaluate increases 1 by default. So, we increase the difference. - return state_ops.assign_add( - eval_step, - math_ops.cast(iterations_per_loop - 1, dtype=eval_step.dtype), - use_locking=True) - - -def _extract_key_names(tensor_or_dict): - if isinstance(tensor_or_dict, dict): - return sorted(tensor_or_dict.keys()) - return [] - - -class PeriodicLogger(object): - - def __init__(self, seconds): - self._log_every_n_seconds = seconds - self._last_log_time = 0 - - def log(self, msg, *args, **kw): - if time.time() - self._last_log_time > self._log_every_n_seconds: - self._last_log_time = time.time() - logging.info(msg, *args, **kw) - - -class _SIGNAL(object): - """Signal used to control the thread of infeed/outfeed. - - All preserved signals must be negative numbers. Positive numbers are used to - indicate the number of iterations for next training/evaluation loop. - """ - NEXT_BATCH = -1 - STOP = -2 - - -class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec): # pylint: disable=protected-access - """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`. 
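The adaptive schedule described in the docstring above can be summarized with a short, pure-Python sketch (the helper name is illustrative, not an API): before each `Session.run` of the training loop, the variable is loaded with the smaller of `iterations_per_loop` and the steps still remaining.

def iterations_schedule(total_steps, iterations_per_loop):
  # Yields the value loaded into the iterations_per_loop variable before
  # each TPU execution, so the global step lands exactly on total_steps.
  remaining = total_steps
  while remaining > 0:
    iterations = min(iterations_per_loop, remaining)
    yield iterations
    remaining -= iterations

print(list(iterations_schedule(10, 4)))  # [4, 4, 2], as in the example above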
- - See `EstimatorSpec` for `mode`, `predictions`, `loss`, `train_op`, and - `export_outputs`. - - For evaluation, `eval_metrics `is a tuple of `metric_fn` and `tensors`, where - `metric_fn` runs on CPU to generate metrics and `tensors` represents the - `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`. - To be precise, TPU evaluation expects a slightly different signature from the - `tf.estimator.Estimator`. While `EstimatorSpec.eval_metric_ops` expects a - dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`. - The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The - `tensors` usually specify the model logits, which are transferred back from - TPU system to CPU host. All tensors must have be batch-major, i.e., the batch - size is the first dimension. Once all tensors are available at CPU host from - all shards, they are concatenated (on CPU) and passed as positional arguments - to the `metric_fn` if `tensors` is list or keyword arguments if `tensors` is - a dict. `metric_fn` takes the `tensors` and returns a dict from metric string - name to the result of calling a metric function, namely a `(metric_tensor, - update_op)` tuple. See `TPUEstimator` for MNIST example how to specify the - `eval_metrics`. - - `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This - function should not capture any Tensors in `model_fn`. - - `host_call` is a tuple of a `function` and a list or dictionary of `tensors` - to pass to that function and returns a list of Tensors. `host_call` currently - works for train() and evaluate(). The Tensors returned by the function is - executed on the CPU on every step, so there is communication overhead when - sending tensors from TPU to CPU. To reduce the overhead, try reducing the - size of the tensors. The `tensors` are concatenated along their major (batch) - dimension, and so must be >= rank 1. The `host_call` is useful for writing - summaries with `tf.contrib.summary.create_file_writer`. 
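A hedged usage sketch of the `eval_metrics` contract described above (the `model_fn`, feature key `x`, and layer sizes are illustrative, and only the evaluation path is shown): the second element of the tuple is a dict of batch-major tensors that are concatenated on the CPU host and passed to `metric_fn` as keyword arguments.

import tensorflow as tf

def model_fn(features, labels, mode, params):
  logits = tf.layers.dense(features['x'], 10)
  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

  def metric_fn(labels, logits):
    # Runs on the CPU host once the per-shard tensors have been
    # concatenated along the batch (major) dimension.
    predictions = tf.argmax(logits, axis=-1)
    return {'accuracy': tf.metrics.accuracy(labels=labels,
                                            predictions=predictions)}

  return tf.contrib.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=loss,
      eval_metrics=(metric_fn, {'labels': labels, 'logits': logits}))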
- """ - - def __new__(cls, - mode, - predictions=None, - loss=None, - train_op=None, - eval_metrics=None, - export_outputs=None, - scaffold_fn=None, - host_call=None, - training_hooks=None, - evaluation_hooks=None, - prediction_hooks=None): - """Creates a validated `TPUEstimatorSpec` instance.""" - host_calls = {} - if eval_metrics is not None: - host_calls['eval_metrics'] = eval_metrics - if host_call is not None: - host_calls['host_call'] = host_call - _OutfeedHostCall.validate(host_calls) - - training_hooks = tuple(training_hooks or []) - evaluation_hooks = tuple(evaluation_hooks or []) - prediction_hooks = tuple(prediction_hooks or []) - - for hook in training_hooks + evaluation_hooks + prediction_hooks: - if not isinstance(hook, session_run_hook.SessionRunHook): - raise TypeError('All hooks must be SessionRunHook instances, given: {}' - .format(hook)) - - return super(TPUEstimatorSpec, cls).__new__( - cls, - mode=mode, - predictions=predictions, - loss=loss, - train_op=train_op, - eval_metrics=eval_metrics, - export_outputs=export_outputs, - scaffold_fn=scaffold_fn, - host_call=host_call, - training_hooks=training_hooks, - evaluation_hooks=evaluation_hooks, - prediction_hooks=prediction_hooks) - - def as_estimator_spec(self): - """Creates an equivalent `EstimatorSpec` used by CPU train/eval.""" - host_calls = {} - if self.eval_metrics is not None: - host_calls['eval_metrics'] = self.eval_metrics - if self.host_call is not None: - host_calls['host_call'] = self.host_call - host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls) - eval_metric_ops = None - if self.eval_metrics is not None: - eval_metric_ops = host_call_ret['eval_metrics'] - hooks = None - if self.host_call is not None: - hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])] - loss = self.loss - if tensor_tracer.TensorTracer.is_enabled() \ - and self.train_op is not None: - tt = tensor_tracer.TensorTracer() - loss = tt.trace_cpu(ops.get_default_graph(), loss, self.train_op) - - hooks = tuple(hooks or []) - scaffold = self.scaffold_fn() if self.scaffold_fn else None - return model_fn_lib.EstimatorSpec( - mode=self.mode, - predictions=self.predictions, - loss=loss, - train_op=self.train_op, - eval_metric_ops=eval_metric_ops, - export_outputs=self.export_outputs, - scaffold=scaffold, - training_hooks=self.training_hooks + hooks, - evaluation_hooks=self.evaluation_hooks + hooks, - prediction_hooks=self.prediction_hooks + hooks) - - -class _OpQueueContext(object): - """Manages work queue and thread for a infeed/outfeed thread.""" - - def __init__(self, name, target, args): - self._name = name - self._queue = Queue.Queue() - args = (self,) + args - self._thread = threading.Thread(name=name, target=target, args=args) - self._thread.daemon = True - self._thread.start() - - def stop(self): - self._queue.put(_SIGNAL.STOP) - - def send_next_batch_signal(self, iterations): - self._queue.put(iterations) - - def read_iteration_counts(self): - while True: - iterations = self._queue.get(block=True) - logging.debug('%s read iterations %s', self._name, iterations) - if iterations == _SIGNAL.STOP: - logging.info('%s received shutdown signal, stopping.', self._name) - return - yield iterations - - def join(self): - logging.info('Shutting down %s thread.', self._name) - self.stop() - self._thread.join() - - -class _OpSignalOnceQueueContext(_OpQueueContext): - """Manages work queue and thread for a infeed/outfeed thread. - - This subclass only signals once. 
- """ - - def __init__(self, name, target, args): - super(_OpSignalOnceQueueContext, self).__init__(name, target, args) - self._has_signaled = False - - def send_next_batch_signal(self, iterations): - if not self._has_signaled: - self._queue.put(iterations) - self._has_signaled = True - - -class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): - """A Session hook setting up the TPU initialization, infeed, and outfeed. - - This hook does two major things: - 1. initialize and shutdown TPU system. - 2. launch and join the threads for infeed enqueue and (optional) outfeed - dequeue. - """ - - def __init__(self, - ctx, - enqueue_ops, - dequeue_ops, - tpu_compile_op, - run_infeed_loop_on_coordinator=True, - rendezvous=None, - master=None, - session_config=None, - tpu_init_ops=None): - self._master_job = ctx.master_job - self._enqueue_ops = enqueue_ops - self._dequeue_ops = dequeue_ops - self._rendezvous = rendezvous - self._master = master - self._session_config = session_config - self._init_ops = list(tpu_init_ops or []) - if ctx.embedding_config is None: - self._embedding_layer_config = None - else: - self._embedding_layer_config = ( - ctx.embedding_config.tpu_embedding.config_proto) - self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator - self._initial_infeed_sleep_secs = ( - ctx.config.tpu_config.initial_infeed_sleep_secs) - - # When using model parallelism, the TPU is pre-initialized at startup to - # fetch mesh information. We skip re-initializing it here for - # MeshTensorFlow since it places variables on TPU directly. Reinitialize tpu - # is causing the variable corruption since the previous allocated memory - # might be overwritten for other purpose. - if (ctx.model_parallelism_enabled and - (ctx.config.tpu_config.per_host_input_for_training is - tpu_config.InputPipelineConfig.BROADCAST)): - self._should_initialize_tpu = False - else: - self._should_initialize_tpu = True - - self._tpu_compile_op = tpu_compile_op - - def begin(self): - logging.info('TPU job name %s', self._master_job) - self._iterations_per_loop_var = _create_or_get_iterations_per_loop() - if self._should_initialize_tpu: - self._finalize_ops = [tpu.shutdown_system(job=self._master_job)] - else: - self._finalize_ops = [] - - summary_writer_init_ops = contrib_summary.summary_writer_initializer_op() - self._init_ops.extend(summary_writer_init_ops) - # Get all the writer resources from the initializer, so we know what to - # flush. 
- for op in summary_writer_init_ops: - self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0])) - - def _run_infeed(self, queue_ctx, session): - logging.info('Starting infeed thread controller.') - if self._initial_infeed_sleep_secs: - logging.info('Infeed thread sleeping for %d seconds.', - self._initial_infeed_sleep_secs) - time.sleep(self._initial_infeed_sleep_secs) - logging.info('Infeed thread starting after sleep') - - with self._rendezvous.catch_errors(source='infeed', session=session): - if self._run_infeed_loop_on_coordinator: - for count, steps in enumerate(queue_ctx.read_iteration_counts()): - for i in xrange(steps): - logging.debug('Infeed enqueue for iteration (%d, %d)', count, i) - session.run(self._enqueue_ops) - else: - for _ in queue_ctx.read_iteration_counts(): - session.run(self._enqueue_ops) - logging.info('Infeed thread finished, shutting down.') - - def _run_outfeed(self, queue_ctx, session): - logging.info('Starting outfeed thread controller.') - status_logger = PeriodicLogger(seconds=60) - with self._rendezvous.catch_errors(source='outfeed', session=session): - for count, steps in enumerate(queue_ctx.read_iteration_counts()): - for i in xrange(steps): - logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i) - session.run(self._dequeue_ops) - status_logger.log('Outfeed finished for iteration (%d, %d)', count, i) - logging.info('Outfeed thread finished, shutting down.') - - def _create_infeed_controller(self, name, target, args): - return _OpQueueContext(name=name, target=target, args=args) - - def _assertCompilationSucceeded(self, result, coord): - proto = tpu_compilation_result.CompilationResultProto() - proto.ParseFromString(result) - if proto.status_error_message: - logging.error('Compilation failed: {}'.format(proto.status_error_message)) - coord.request_stop() - else: - logging.info('Compilation succeeded') - - def after_create_session(self, session, coord): - if self._should_initialize_tpu: - logging.info('Init TPU system') - start = time.time() - with ops.Graph().as_default(): - with tf_session.Session( - self._master, config=self._session_config) as sess: - sess.run( - tpu.initialize_system( - job=self._master_job, - embedding_config=self._embedding_layer_config)) - logging.info('Initialized TPU in %d seconds', time.time() - start) - - session.run(self._init_ops, - options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000)) - - if os.environ.get('TPU_SPLIT_COMPILE_AND_EXECUTE', '') == '1': - logging.info('Compiling user program: this may take a while...') - self._assertCompilationSucceeded(session.run(self._tpu_compile_op), coord) - - self._infeed_controller = self._create_infeed_controller( - name='InfeedController', target=self._run_infeed, args=(session,)) - - self._outfeed_controller = _OpQueueContext( - name='OutfeedController', target=self._run_outfeed, args=(session,)) - - # Enable the worker watchdog to terminate workers on coordinator exit. 
- watchdog_timeout = int(os.environ.get('TF_TPU_WATCHDOG_TIMEOUT', '0')) - if watchdog_timeout > 0: - session_support.start_worker_watchdog(session, - shutdown_timeout=watchdog_timeout) - - def before_run(self, run_context): - iterations = run_context.session.run(self._iterations_per_loop_var) - - logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations) - self._infeed_controller.send_next_batch_signal(iterations) - - logging.info('Dequeue next (%d) batch(es) of data from outfeed.', - iterations) - self._outfeed_controller.send_next_batch_signal(iterations) - - def end(self, session): - logging.info('Stop infeed thread controller') - self._infeed_controller.join() - self._rendezvous.record_done('infeed') - - logging.info('Stop output thread controller') - self._outfeed_controller.join() - self._rendezvous.record_done('outfeed') - - logging.info('Shutdown TPU system.') - session.run(self._finalize_ops) - - -class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook): - - def __init__(self, ctx, enqueue_ops, dequeue_ops, tpu_compile_op, - rendezvous=None, master=None, session_config=None): - super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__( - ctx, - enqueue_ops, - dequeue_ops, - tpu_compile_op=tpu_compile_op, - run_infeed_loop_on_coordinator=False, - rendezvous=rendezvous, - master=master, - session_config=session_config) - - def _create_infeed_controller(self, name, target, args): - return _OpSignalOnceQueueContext(name=name, target=target, args=args) - - -class _TPUStopAtStepHook(session_run_hook.SessionRunHook): - """Hook that requests stop at a specified step. - - This hook is similar to the `session_run_hook._StopAfterNEvalsHook` with - following differences for TPU training: - - 1. This hook sets the variable for iterations_per_loop, which is used by - `TPUInfeedOutfeedSessionHook` to control the iterations for infeed/outfeed. - As the hook execution order is not guaranteed, the variable update is - handled in `after_create_session` and `after_run` as - `TPUInfeedOutfeedSessionHook` reads the variable value in `before_run`. - - 2. For each training loop (session.run), the global step could be increased - multiple times on TPU. The global step tensor value will be explicitly read - again in `after_run` to ensure the latest value is retrieved to avoid race - condition. - """ - - def __init__(self, iterations, num_steps=None, last_step=None): - """Initializes a `StopAtStepHook`. - - Args: - iterations: The number of iterations to run optimizer per training loop. - num_steps: Number of steps to execute. - last_step: Step after which to stop. - - Raises: - ValueError: If one of the arguments is invalid. 
- """ - if num_steps is None and last_step is None: - raise ValueError('One of num_steps or last_step must be specified.') - if num_steps is not None and last_step is not None: - raise ValueError('Only one of num_steps or last_step can be specified.') - self._num_steps = num_steps - self._last_step = last_step - self._iterations = iterations - - def _next_iterations(self, global_step, last_step): - gap = last_step - global_step - return min(gap, self._iterations) - - def begin(self): - self._global_step_tensor = training_util.get_global_step() - if self._global_step_tensor is None: - raise RuntimeError('Global step should be created.') - - self._iterations_per_loop_var = _create_or_get_iterations_per_loop() - - def after_create_session(self, session, coord): - global_step = session.run(self._global_step_tensor) - if self._last_step is None: - self._last_step = global_step + self._num_steps - - iterations = self._next_iterations(global_step, self._last_step) - - self._iterations_per_loop_var.load(iterations, session=session) - - def after_run(self, run_context, run_values): - # Global step cannot be retrieved via SessionRunArgs and before_run due to - # race condition. - global_step = run_context.session.run(self._global_step_tensor) - if global_step >= self._last_step: - run_context.request_stop() - else: - iterations = self._next_iterations(global_step, self._last_step) - self._iterations_per_loop_var.load( - iterations, session=run_context.session) - - -class _SetEvalIterationsHook(session_run_hook.SessionRunHook): - """Hook that requests stop at a specified step.""" - - def __init__(self, num_steps): - """Initializes a `_SetEvalIterationsHook`. - - Args: - num_steps: Number of steps to execute. - """ - self._num_steps = num_steps - - def begin(self): - self._iterations_per_loop_var = _create_or_get_iterations_per_loop() - - def after_create_session(self, session, coord): - self._iterations_per_loop_var.load(self._num_steps, session=session) - - -class _StoppingPredictHook(session_run_hook.SessionRunHook): - """Hook that requests stop according to the stopping signal in prediction.""" - - def __init__(self, scalar_stopping_signal): - self._scalar_stopping_signal = scalar_stopping_signal - - def begin(self): - self._iterations_per_loop_var = _create_or_get_iterations_per_loop() - - def after_create_session(self, session, coord): - # This is not necessary as we do not run infeed enqueue and outfeed dequeue - # in side threads for prediction model. But it makes the - # TPUInfeedOutfeedSessionHook prints nice message. - self._iterations_per_loop_var.load(1, session=session) - - def before_run(self, run_context): - return session_run_hook.SessionRunArgs(self._scalar_stopping_signal) - - def after_run(self, run_context, run_values): - _ = run_context - scalar_stopping_signal = run_values.results - if _StopSignals.should_stop(scalar_stopping_signal): - # NOTE(xiejw): In prediction, stopping signals are inserted for each - # batch. And we append one more batch to signal the system it should stop. - # The data flow might look like - # - # batch 0: images, labels, stop = 0 (user provided) - # batch 1: images, labels, stop = 0 (user provided) - # ... - # batch 99: images, labels, stop = 0 (user provided) - # batch 100: images, labels, stop = 1 (TPUEstimator appended) - # - # where the final batch (id = 100) is appended by TPUEstimator, so we - # should drop it before returning the predictions to user. - # To achieve that, we throw the OutOfRangeError in after_run. 
Once - # Monitored Session sees this error in SessionRunHook.after_run, the - # "current" prediction, i.e., batch with id=100, will be discarded - # immediately - raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.') - - -def generate_per_core_enqueue_ops_fn_for_host( - ctx, input_fn, inputs_structure_recorder, host_device, host_id): - """Generates infeed enqueue ops for per-core input_fn on a single host.""" - captured_infeed_queue = _CapturedObject() - tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id) - - def enqueue_ops_fn(): - """A fn returns enqueue_ops.""" - num_cores_per_host = ctx.num_of_cores_per_host - per_host_sharded_inputs = [] - for core_ordinal in range(num_cores_per_host): - with ops.name_scope('ordinal_%d' % (core_ordinal)): - user_context = tpu_context.TPUContext( - internal_ctx=ctx, - input_device=host_device, - invocation_index=host_id * ctx.num_of_cores_per_host + core_ordinal) - inputs = _Inputs.from_input_fn(input_fn(user_context)) - if inputs.is_dataset: - raise TypeError( - '`input_fn` returning `Dataset` is not yet supported in ' - 'per-Core input pipeline deployment yet. Please set ' - 'TPUConfig.per_host_input_for_training to True or return ' - '`features` and `labels` from `input_fn`') - features, labels = inputs.features_and_labels() - - inputs_structure_recorder.validate_and_record_structure( - features, labels) - flattened_inputs = ( - inputs_structure_recorder.flatten_features_and_labels( - features, labels)) - per_host_sharded_inputs.append(flattened_inputs) - - infeed_queue = tpu_feed.InfeedQueue( - number_of_tuple_elements=len(per_host_sharded_inputs[0])) - captured_infeed_queue.capture(infeed_queue) - - per_host_enqueue_ops = infeed_queue.generate_enqueue_ops( - per_host_sharded_inputs, tpu_ordinal_function=tpu_ordinal_function_impl) - return per_host_enqueue_ops - - return enqueue_ops_fn, captured_infeed_queue - - -def generate_per_host_enqueue_ops_fn_for_host( - ctx, input_fn, inputs_structure_recorder, batch_axis, device, host_id): - """Generates infeed enqueue ops for per-host input_fn on a single host.""" - captured_infeed_queue = _CapturedObject() - - dataset_initializer = None - - with ops.device(device): - user_context = tpu_context.TPUContext( - internal_ctx=ctx, input_device=device, invocation_index=host_id) - inputs = _Inputs.from_input_fn(input_fn(user_context)) - - is_dataset = inputs.is_dataset - if ctx.mode == model_fn_lib.ModeKeys.PREDICT: - if not is_dataset: - raise TypeError( - 'For mode PREDICT, `input_fn` must return `Dataset` instead of ' - '`features` and `labels`.') - if batch_axis is not None: - raise TypeError('For mode PREDICT, batch_axis is not supported yet.') - inputs = _InputsWithStoppingSignals( - dataset=inputs.dataset, - batch_size=ctx.batch_size_for_input_fn, - add_padding=True) - - if is_dataset: - dataset_initializer = inputs.dataset_initializer() - - tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id) - - def enqueue_ops_fn(): - """A Fn returning the TPU infeed enqueue ops. - - By providing as a Fn, it can be invoked inside the tf.while_loop such that - the input pipeline for multiple iterations can be executed by one - Session.run call. - - Returns: - list of dict of ops. - """ - with ops.device(device): - num_of_replicas_per_host = ctx.num_of_replicas_per_host - # Convert user input to features and labels. 
If the user returns a - # dataset, it is initialized and the features and labels extracted via - # `dataset.iterator.get_next()` - features, labels = inputs.features_and_labels() - signals = inputs.signals() - - inputs_structure_recorder.validate_and_record_structure(features, labels) - unsharded_tensor_list = ( - inputs_structure_recorder.flatten_features_and_labels( - features, labels, signals)) - - infeed_queue = tpu_feed.InfeedQueue( - tuple_types=[t.dtype for t in unsharded_tensor_list], - tuple_shapes=[t.shape for t in unsharded_tensor_list], - shard_dimensions=batch_axis) - captured_infeed_queue.capture(infeed_queue) - infeed_queue.set_number_of_shards(num_of_replicas_per_host) - per_host_enqueue_ops = ( - infeed_queue.split_inputs_and_generate_enqueue_ops( - unsharded_tensor_list, - placement_function=lambda x: device, - tpu_ordinal_function=tpu_ordinal_function_impl)) - if signals is None: - return per_host_enqueue_ops - else: - return { - 'ops': per_host_enqueue_ops, - 'signals': signals, - } - - return enqueue_ops_fn, captured_infeed_queue, dataset_initializer - - -def generate_per_host_v2_enqueue_ops_fn_for_host( - ctx, input_fn, inputs_structure_recorder, device, host_id): - """Generates infeed enqueue ops for per-host input_fn on a single host.""" - captured_infeed_queue = _CapturedObject() - dataset_initializer = None - - with ops.device(device): - user_context = tpu_context.TPUContext( - internal_ctx=ctx, input_device=device, invocation_index=host_id) - inputs = _Inputs.from_input_fn(input_fn(user_context)) - - is_dataset = inputs.is_dataset - if not is_dataset: - raise TypeError('`input_fn` must return a `Dataset` for the PER_HOST_V2 ' - 'input pipeline configuration.') - - if ctx.mode == model_fn_lib.ModeKeys.PREDICT: - inputs = _InputsWithStoppingSignals( - dataset=inputs.dataset, - batch_size=ctx.batch_size_for_input_fn, - add_padding=True, - num_invocations_per_step=ctx.num_of_replicas_per_host) - - dataset_initializer = inputs.dataset_initializer() - tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id) - - def enqueue_ops_fn(): - """Generates the per_host enqueue ops.""" - control_deps = [] - per_host_sharded_inputs = [] - enqueue_datas_list = [] - num_replicas_per_host = ctx.num_of_replicas_per_host - cached_signals = None - with ops.device(device): - if not inputs.is_dataset: - raise TypeError('`input_fn` must return a `Dataset` for this mode.') - for _ in range(num_replicas_per_host): - # Use control dependencies to ensure a deterministic ordering. - with ops.control_dependencies(control_deps): - features, labels = inputs.features_and_labels() # Calls get_next() - signals = inputs.signals() - - # All the replicas share the replica 0's stopping singal. - # This avoids inconsistent state among different model replcias. 
- if cached_signals: - signals['stopping'] = cached_signals['stopping'] - else: - cached_signals = signals - - features, labels, enqueue_data = ( - _tpu_estimator_embedding.split_inputs(ctx, features, labels)) - enqueue_datas_list.append(enqueue_data) - - inputs_structure_recorder.validate_and_record_structure( - features, labels) - flattened_inputs = ( - inputs_structure_recorder.flatten_features_and_labels( - features, labels, signals)) - control_deps.extend(flattened_inputs) - per_host_sharded_inputs.append(flattened_inputs) - - if inputs_structure_recorder.flattened_input_dims: - input_partition_dims = inputs_structure_recorder.flattened_input_dims - if signals: - input_partition_dims += [None] * len(signals) - # pylint: disable=protected-access - infeed_queue = tpu_feed._PartitionedInfeedQueue( - number_of_tuple_elements=len(per_host_sharded_inputs[0]), - host_id=host_id, - input_partition_dims=input_partition_dims, - device_assignment=ctx.device_assignment) - per_host_enqueue_ops = infeed_queue.generate_enqueue_ops( - per_host_sharded_inputs) - else: - infeed_queue = tpu_feed.InfeedQueue( - number_of_tuple_elements=len(per_host_sharded_inputs[0])) - per_host_enqueue_ops = infeed_queue.generate_enqueue_ops( - per_host_sharded_inputs, - tpu_ordinal_function=tpu_ordinal_function_impl) - captured_infeed_queue.capture(infeed_queue) - - if ctx.embedding_config: - per_host_enqueue_ops.extend( - ctx.embedding_config.tpu_embedding.generate_enqueue_ops( - enqueue_datas_list)) - - if signals is None: - return per_host_enqueue_ops - else: - return { - 'ops': per_host_enqueue_ops, - 'signals': signals, - } - - return enqueue_ops_fn, captured_infeed_queue, dataset_initializer - - -def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder, - num_hosts): - """Generates infeed enqueue ops for one input_fn on all the hosts.""" - captured_infeed_queue = _CapturedObject() - dataset_initializer = None - device_0 = ctx.tpu_host_placement_function(host_id=0) - with ops.device(device_0): - user_context = tpu_context.TPUContext( - internal_ctx=ctx, input_device=device_0, invocation_index=0) - inputs = _Inputs.from_input_fn(input_fn(user_context)) - - is_dataset = inputs.is_dataset - if ctx.mode == model_fn_lib.ModeKeys.PREDICT: - if not is_dataset: - raise TypeError( - 'For mode PREDICT, `input_fn` must return `Dataset` instead of ' - '`features` and `labels`.') - - inputs = _InputsWithStoppingSignals( - dataset=inputs.dataset, - batch_size=ctx.batch_size_for_input_fn, - add_padding=True) - - if is_dataset: - dataset_initializer = inputs.dataset_initializer() - num_replicas_per_host = ctx.num_of_replicas_per_host - - def tpu_ordinal_function_impl(replica_id): - if ctx.device_assignment: - return ctx.device_assignment.tpu_ordinal(replica=replica_id) - else: - return replica_id % num_replicas_per_host - - def device_function_impl(replica_id): - return ctx.tpu_host_placement_function(replica_id=replica_id) - - def enqueue_ops_fn(): - """Generates enqueue ops for all the hosts.""" - broadcasted_inputs = [] - flattened_inputs = None # Cache result from input_fn. - signals = None - num_replicas = ctx.num_replicas - core_id = 0 - for host_id in xrange(num_hosts): - with ops.device(ctx.tpu_host_placement_function(host_id=host_id)): - for _ in xrange(ctx.num_of_replicas_per_host): - # Note: input_fn is only called once at host 0 for the first replica. - # The features and labels returned from that invocation are - # broadcasted to other replicas(including the replicas on other - # hosts). 
- if flattened_inputs is None: - features, labels = inputs.features_and_labels() # Calls get_next() - signals = inputs.signals() - - inputs_structure_recorder.validate_and_record_structure( - features, labels) - flattened_inputs = ( - inputs_structure_recorder.flatten_features_and_labels( - features, labels, signals)) - if (ctx.config.tpu_config.eval_training_input_configuration is - tpu_config.InputPipelineConfig.SLICED): - input_slices = [ - array_ops.split(x, num_replicas) for x in flattened_inputs - ] - if (ctx.config.tpu_config.eval_training_input_configuration is - tpu_config.InputPipelineConfig.SLICED): - # for each core, slice out the flattened_inputs for each core. - broadcasted_inputs.append([x[core_id] for x in input_slices]) - core_id += 1 - else: - broadcasted_inputs.append(flattened_inputs) - - infeed_queue = tpu_feed.InfeedQueue( - number_of_tuple_elements=len(broadcasted_inputs[0])) - captured_infeed_queue.capture(infeed_queue) - enqueue_ops = infeed_queue.generate_enqueue_ops( - broadcasted_inputs, - tpu_ordinal_function=tpu_ordinal_function_impl, - placement_function=device_function_impl) - - if signals is None: - return enqueue_ops - else: - return { - 'ops': enqueue_ops, - 'signals': signals, - } - - return enqueue_ops_fn, captured_infeed_queue, dataset_initializer - - -class _InputPipeline(object): - """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue. - - `_InputPipeline` abstracts the per-core/per-host `input_fn` invocation from - call site. To be precise, based on the configuration in - `_InternalTPUContext`, it invokes `input_fn` for all cores (usually - multi-host TPU training) or for one host (usually for single-host TPU - evaluation), and sends all `features` and `labels` returned by `input_fn` to - TPU infeed. For per-core invocation, `features` and `labels` are piped to - infeed directly, one tuple for each core. For per-host invocation, `features` - and `labels` are split at host (with respect to `batch_axis`) and piped to all - cores accordingly. - - In addition, flatten/unflatten are handled by `_InputPipeline` also. Model - inputs returned by the `input_fn` can have one of the following forms: - 1. features - 2. (features, labels) - 3. ((arbitrarily nested structure of features), labels) - - Internally, form 1 is reformed to `(features, None)` as features and labels - are passed separately to underlying methods. For TPU training, TPUEstimator - may expect multiple `features` and `labels` tuples one for each core. - - TPUEstimator allows various different structures for inputs (namely `features` - and `labels`). Both `features` and `labels` can be any nested sturcture - supported by TF nest (namely, dict, tuples, namedtuples or any nested - structure of such of Tensors). `labels` could be `None` as well. - - These are flattened before they are passed to the infeed/outfeed library - as that expectes flattend lists. - """ - - class InputsStructureRecorder(object): - """The recorder to record inputs structure.""" - - def __init__(self, input_partition_dims=None): - # Holds the structure of inputs - self._feature_structure = {} - self._flattened_input_dims = None - - if input_partition_dims: - # This should have been validated in TPUConfig. - assert len(input_partition_dims) <= 2, 'must have 1 or 2 elements.' 
- if len(input_partition_dims) == 2: - self._feature_dims, self._label_dims = input_partition_dims - else: - self._feature_dims = input_partition_dims[0] - self._label_dims = None - - assert self._feature_dims is not None, ('input_partition_dims[0] must ' - 'not be None') - else: - self._feature_dims = None - self._label_dims = None - - # Internal state. - self._initialized = False - - @property - def flattened_input_dims(self): - assert self._initialized, 'InputsStructureRecorder is not initialized.' - return self._flattened_input_dims - - def has_labels(self): - return 'labels' in self._feature_structure - - def _flatten_input_dims(self, feature_dims, feature_dims_names, label_dims, - label_dims_names, label_names, has_labels): - """Flatten input dims with the same order as flattened input tensors.""" - flattened_input_dims = [] - if feature_dims_names: - # We need a fixed ordering for matching the tensors in features. - flattened_input_dims.extend( - [feature_dims[name] for name in feature_dims_names]) - else: - flattened_input_dims.append(feature_dims) - - if label_dims_names: - # We need a fixed ordering for matching the tensors in labels. - flattened_input_dims.extend( - [label_dims[name] for name in label_dims_names]) - else: - if label_names: - num_tensors_in_label = len(label_names) - else: - num_tensors_in_label = int(has_labels) - # Setting `None` in input_partition_dims[1] will apply `None` to - # all the tensors in labels, regardless of internal structure. - flattened_input_dims.extend([label_dims] * num_tensors_in_label) - - return flattened_input_dims - - def validate_and_record_structure(self, features, labels): - """Validates and records the structure of `features` and `labels`.""" - # Extract structure. - has_labels = labels is not None - feature_names = _extract_key_names(features) - label_names = _extract_key_names(labels) - - if not self._initialized: - # Record structure. - self._initialized = True - if self._feature_dims is not None: - feature_dims_names = _extract_key_names(self._feature_dims) - if feature_dims_names != feature_names: - raise ValueError( - 'TPUConfig.input_partition_dims[0] mismatched feature' - ' keys. Expected {}, got {}'.format(feature_names, - feature_dims_names)) - - label_dims_names = _extract_key_names(self._label_dims) - if self._label_dims is not None and label_dims_names != label_names: - raise ValueError( - 'TPUConfig.input_partition_dims[1] mismatched label' - ' keys. Expected {}, got {}'.format(label_names, - label_dims_names)) - - self._flattened_input_dims = self._flatten_input_dims( - self._feature_dims, feature_dims_names, self._label_dims, - label_dims_names, label_names, has_labels) - - def flatten_features_and_labels(self, features, labels, signals=None): - """Flattens the `features` and `labels` to a single tensor list.""" - self._feature_structure['features'] = features - if labels is not None: - self._feature_structure['labels'] = labels - if signals is not None: - self._feature_structure['signals'] = signals - return data_nest.flatten(self._feature_structure) - - def unflatten_features_and_labels(self, flattened_inputs): - """Restores the flattened inputs to original features and labels form. - - Args: - flattened_inputs: Flattened inputs for each shard. - - Returns: - A tuple of (`features`, `labels`), where `labels` could be None. - Each one, if present, should have identical structure (single tensor vs - dict) as the one returned by input_fn. 
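`flatten_features_and_labels` and `unflatten_features_and_labels` above are a plain nest round trip: the structures are flattened into one tensor list for the infeed queue and re-packed on the TPU side. A small sketch of that round trip using the same nest module the file imports (the example values are placeholder strings, not real model inputs):

from tensorflow.python.data.util import nest as data_nest

structure = {'features': {'x': 'x_tensor', 'w': 'w_tensor'},
             'labels': 'y_tensor'}

# Infeed side: flatten to a single list, as flatten_features_and_labels does.
flat = data_nest.flatten(structure)

# TPU side: pack the dequeued list back into the recorded structure, as
# unflatten_features_and_labels does.
restored = data_nest.pack_sequence_as(structure, flat)
assert restored == structure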
- - Raises: - ValueError: If the number of expected tensors from `flattened_inputs` - mismatches the recorded structure. - """ - - unflattened_inputs = data_nest.pack_sequence_as(self._feature_structure, - flattened_inputs) - return _Inputs( - unflattened_inputs['features'], - unflattened_inputs.get('labels'), - signals=unflattened_inputs.get('signals')) - - def __init__(self, input_fn, batch_axis, ctx): - """Constructor. - - Args: - input_fn: input fn for train or eval. - batch_axis: A python tuple of int values describing how each tensor - produced by the Estimator `input_fn` should be split across the TPU - compute shards. - ctx: A `_InternalTPUContext` instance with mode. - - Raises: - ValueError: If both `sharded_features` and `num_cores` are `None`. - """ - self._inputs_structure_recorder = _InputPipeline.InputsStructureRecorder( - ctx.input_partition_dims) - - self._sharded_per_core = ctx.is_input_sharded_per_core() - self._input_fn = input_fn - self._infeed_queue = None - self._ctx = ctx - self._batch_axis = batch_axis - - def generate_infeed_enqueue_ops_and_dequeue_fn(self): - """Generates infeed enqueue ops and dequeue_fn.""" - # While tf.while_loop is called, the body function, which invokes - # `enqueue_fn` passed in, is called to construct the graph. So, input_fn - # structure is recorded. - enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = ( - self._invoke_input_fn_and_record_structure()) - - self._validate_input_pipeline() - - def dequeue_fn(): - """dequeue_fn is used by TPU to retrieve the tensors.""" - # In the model-parallel case, both the host-side and device-side - # computations must agree on the core on which infeed takes place. We - # choose to perform infeed on logical core 0 of each replica. - values = self._infeed_queue.generate_dequeue_op(tpu_device=0) - # The unflatten process uses the structure information recorded above. - return self._inputs_structure_recorder.unflatten_features_and_labels( - values) - - return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator) - - def _invoke_input_fn_and_record_structure(self): - """Deploys the input pipeline and record input structure.""" - enqueue_ops = [] - infeed_queues = [] - all_dataset_initializers = [] - num_hosts = self._ctx.num_hosts - tpu_host_placement_fn = self._ctx.tpu_host_placement_function - - run_infeed_loop_on_coordinator = True - - if self._sharded_per_core: - # Per-Core input pipeline deployment. - # Invoke input pipeline for each core and placed on the corresponding - # host. - for host_id in range(num_hosts): - host_device = tpu_host_placement_fn(host_id=host_id) - with ops.device(host_device): - with ops.name_scope('input_pipeline_task%d' % (host_id)): - enqueue_ops_fn, captured_infeed_queue = ( - generate_per_core_enqueue_ops_fn_for_host( - self._ctx, self._input_fn, self._inputs_structure_recorder, - host_device, host_id)) - - if _WRAP_INPUT_FN_INTO_WHILE_LOOP: - run_infeed_loop_on_coordinator = False - enqueue_ops.append( - _wrap_computation_in_while_loop( - device=host_device, op_fn=enqueue_ops_fn)) - else: - enqueue_ops.append(enqueue_ops_fn()) - # Infeed_queue_getter must be called after enqueue_ops_fn is called. - infeed_queues.append(captured_infeed_queue.get()) - - elif self._ctx.is_input_broadcast_with_iterators(): - # Only calls input_fn in host 0. 
- host_device = tpu_host_placement_fn(host_id=0) - enqueue_ops_fn, captured_infeed_queue, dataset_initializer = ( - generate_broadcast_enqueue_ops_fn(self._ctx, self._input_fn, - self._inputs_structure_recorder, - num_hosts)) - if dataset_initializer: - all_dataset_initializers.append(dataset_initializer) - run_infeed_loop_on_coordinator = False - wrap_fn = ( - _wrap_computation_in_while_loop - if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else - _wrap_computation_in_while_loop_with_stopping_signals) - enqueue_ops.append(wrap_fn(device=host_device, op_fn=enqueue_ops_fn)) - else: - enqueue_ops.append(enqueue_ops_fn()) - infeed_queues.append(captured_infeed_queue.get()) - else: - for host_id in range(num_hosts): - host_device = tpu_host_placement_fn(host_id=host_id) - with ops.device(host_device): - with ops.name_scope('input_pipeline_task%d' % (host_id)): - if self._ctx.is_input_per_host_with_iterators(): - enqueue_ops_fn, captured_infeed_queue, dataset_initializer = ( - generate_per_host_v2_enqueue_ops_fn_for_host( - self._ctx, self._input_fn, - self._inputs_structure_recorder, host_device, host_id)) - else: - enqueue_ops_fn, captured_infeed_queue, dataset_initializer = ( - generate_per_host_enqueue_ops_fn_for_host( - self._ctx, self._input_fn, - self._inputs_structure_recorder, self._batch_axis, - host_device, host_id)) - - # NOTE(xiejw): We dispatch here based on the return type of the - # users `input_fn`. - # - # 1. If input_fn returns a Dataset instance, we initialize the - # iterator outside of tf.while_loop, and call the iterator.get_next - # inside tf.while_loop. This should be always safe. - # - # 2. If input_fn returns (features, labels), it is too late to wrap - # them inside tf.while_loop, as resource initialization cannot be - # handled in TF control flow properly. In this case, we will use - # python loop to enqueue the data into TPU system. This may be - # slow compared to the previous case. - if dataset_initializer: - all_dataset_initializers.append(dataset_initializer) - run_infeed_loop_on_coordinator = False - wrap_fn = ( - _wrap_computation_in_while_loop - if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else - _wrap_computation_in_while_loop_with_stopping_signals) - enqueue_ops.append( - wrap_fn(device=host_device, op_fn=enqueue_ops_fn)) - else: - enqueue_ops.append(enqueue_ops_fn()) - infeed_queues.append(captured_infeed_queue.get()) - # infeed_queue is used to generate dequeue ops. The only thing it uses for - # dequeue is dtypes and types. So, any one can be used. Here, grab the - # first one. - self._infeed_queue = infeed_queues[0] - return enqueue_ops, [ - util_lib.MultiHostDatasetInitializerHook(all_dataset_initializers) - ], run_infeed_loop_on_coordinator - - def _validate_input_pipeline(self): - """Validates the input pipeline. - - Perform some sanity checks to log user friendly information. We should - error out to give users better error message. But, if - _WRAP_INPUT_FN_INTO_WHILE_LOOP is False (legacy behavior), we cannot break - user code, so, log a warning. - - Raises: - RuntimeError: If the validation failed. - """ - if ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS): - err_msg = ('Input pipeline contains one or more QueueRunners. ' - 'It could be slow and not scalable. 
Please consider ' - 'converting your input pipeline to use `tf.data` instead (see ' - 'https://www.tensorflow.org/guide/datasets for ' - 'instructions.') - if _WRAP_INPUT_FN_INTO_WHILE_LOOP: - raise RuntimeError(err_msg) - else: - logging.warn(err_msg) - - -def call_computation(computation_inputs, - computation, - experimental_export_device_assignment, - batch_config=None): - """Call computation. - - Args: - computation_inputs: A tensor or dict of tensors, the inputs to the - computation. - computation: A Python function that takes no inputs and builds computation - graph. If `computation` returns m outputs, this function will return a - list of m Tensors. - experimental_export_device_assignment: If `True`, use user-provided device - assignment. If `False`, round-robin computation among all TPU cores - visible to the host. - batch_config: A BatchConfig named tuple specifying the batching - configuration to use for inference batching. - - Returns: - A list of output tensors. - """ - if experimental_export_device_assignment: - return computation(computation_inputs) - - # Using `TPUPartitionedCall` makes it possible to target a different - # TPU core with every `Session.run()` call. Note that the entire inference - # graph executes on a single core, and that invocations of this graph - # will round-robin among the cores attached to a host. - def tpu_partitioned_call(partition_inputs): - - # capture_resource_var_by_value enables variables to be mirrored on TPU - # to avoid fetching from CPU, since variables do not change during - # inference. - @function.Defun(capture_resource_var_by_value=False) - def tpu_subgraph(): - return computation(partition_inputs) - - return tpu_functional.TPUPartitionedCall( - args=tpu_subgraph.captured_inputs, - device_ordinal=tpu_ops.tpu_ordinal_selector(), - Tout=[o.type for o in tpu_subgraph.definition.signature.output_arg], - f=tpu_subgraph) - - # Not using Batching Function but use TPUPartitionedCall/all cores. - if not batch_config: - return tpu_partitioned_call(computation_inputs) - - # Use Batching Function and TPUPartitionedCall/all cores. - # Note that BatchingFunction requires a list of tensors and doesn't support - # a dict of tensors. So we preserve the structure by deterministically - # flattening the dict before batching and then recomposing it after batching - # to feed into the computation. - ordered_inputs_list = nest.flatten(computation_inputs) - - @batch_ops.batch_function( - num_batch_threads=batch_config.num_batch_threads, - max_batch_size=batch_config.max_batch_size, - batch_timeout_micros=batch_config.batch_timeout_micros, - allowed_batch_sizes=batch_config.allowed_batch_sizes, - max_enqueued_batches=batch_config.max_enqueued_batches, - autograph=False) - def batched_tpu_computation(*tensor_args): - """Recompose the input feature dict and calls the TPU computation.""" - computation_feature_input = nest.pack_sequence_as(computation_inputs, - tensor_args) - return tpu_partitioned_call(computation_feature_input) - - return batched_tpu_computation(*ordered_inputs_list) - - -class _ModelFnWrapper(object): - """A `model_fn` wrapper. - - This makes calling model_fn on CPU and TPU easier and more consistent and - performs necessary check and mutation required by TPU training and evaluation. - - In addition, this wrapper manages converting the `model_fn` to a single TPU - train and eval step. 
- """ - - def __init__(self, model_fn, config, params, ctx): - self._model_fn = model_fn - self._config = config - self._params = params - self._ctx = ctx - - def call_without_tpu(self, features, labels, is_export_mode): - return self._call_model_fn(features, labels, is_export_mode=is_export_mode) - - def _add_embedding_features(self, features, hook_dummy_table_variables): - """Add embedding features, optionally add hook to intercept gradient.""" - if self._ctx.embedding_config: - tpu_embedding_ = self._ctx.embedding_config.tpu_embedding - embedding_activations = tpu_embedding_.get_activations() - if hook_dummy_table_variables: - new_embedding_activations = ( - tpu_embedding_gradient.hook_dummy_table_variables_to_activations( - tpu_embedding_, embedding_activations, - self._ctx.embedding_config.dummy_table_variables)) - features.update(new_embedding_activations) - else: - features.update(embedding_activations) - - def convert_to_single_tpu_train_step(self, dequeue_fn): - """Converts user provided model_fn` as a single train step on TPU. - - The user provided `model_fn` takes input tuple - (features, labels) and produces the EstimatorSpec with train_op and loss for - train `mode`. This usually represents a single train computation on CPU. - - For TPU training, a train (computation) step is first wrapped in a - tf.while_loop control flow to repeat for many times and then replicated to - all TPU shards. Besides the input should be taken from TPU infeed rather - than input pipeline (input_fn) directly. To fit TPU loop and replicate - pattern, the original train computation should be reformed, which is the - returned `train_step`. - - Args: - dequeue_fn: The function to retrieve inputs, features and labels, from TPU - infeed dequeue channel. - - Returns: - A tuple of train_fn, host_calls, and captured scaffold_fn. The train_fn - representing the train step for TPU. - """ - - host_call = _OutfeedHostCall(self._ctx) - captured_scaffold_fn = _CapturedObject() - captured_training_hooks = _CapturedObject() - - def train_step(loss): - """Training step function for use inside a while loop.""" - del loss # unused; required in function signature. - inputs = dequeue_fn() - features, labels = inputs.features_and_labels() - self._add_embedding_features(features, True) - - estimator_spec = self._verify_estimator_spec( - self._call_model_fn(features, labels)) - loss, train_op = estimator_spec.loss, estimator_spec.train_op - - if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec): # pylint: disable=protected-access - captured_scaffold_fn.capture(estimator_spec.scaffold_fn) - else: - captured_scaffold_fn.capture(None) - - captured_training_hooks.capture(estimator_spec.training_hooks) - - if self._ctx.embedding_config is None: - apply_sparse_grads = [] - else: - tpu_embedding_ = self._ctx.embedding_config.tpu_embedding - gradients = ( - tpu_embedding_gradient.get_gradients_through_dummy_table_variables( - tpu_embedding_) - ) - grad_multiplier = self._ctx.embedding_config.get_grad_multiplier() - if grad_multiplier is not None: - scaled_gradients = collections.OrderedDict( - (k, v * grad_multiplier) for k, v in six.iteritems(gradients)) - else: - scaled_gradients = gradients - apply_sparse_grads = [ - tpu_embedding_.generate_send_gradients_op(scaled_gradients) - ] - - # We must run train_op to update the variables prior to running the - # outfeed. 
- with ops.control_dependencies([train_op] + apply_sparse_grads): - host_call_outfeed_ops = [] - host_call_fn, host_call_args = None, [] - - if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec) # pylint: disable=protected-access - and estimator_spec.host_call is not None): - host_call_fn, host_call_args = estimator_spec.host_call - - if host_call_fn: - # Ignore dummy hostcalls (no arguments) - if host_call_args: - host_call.record({'host_call': estimator_spec.host_call}) - host_call_outfeed_ops = host_call.create_enqueue_op() - else: - # Create a host call for the loss to track execution progress - # Without this, we don't have any indication of the state of the - # TPU program. - host_call.record({ - 'host_call': (lambda loss_t: loss_t, - [array_ops.reshape(loss, [1])]) - }) - host_call_outfeed_ops = host_call.create_enqueue_op() - - with ops.control_dependencies(host_call_outfeed_ops): - return array_ops.identity(loss) - - return (train_step, host_call, captured_scaffold_fn, - captured_training_hooks) - - def convert_to_single_tpu_eval_step(self, dequeue_fn): - """Converts user provided model_fn` as a single eval step on TPU. - - Similar to training, the user provided `model_fn` takes input tuple - (features, labels) and produces the TPUEstimatorSpec with eval_metrics for - eval `mode`. This usually represents a single evaluation computation on CPU. - - For TPU evaluation, a eval (computation) step is first wrapped in a - tf.while_loop control flow to repeat for many times and then replicated to - all TPU shards. Besides the input and output are slightly different. Input, - features and labels, should be taken from TPU infeed rather than input - pipeline (input_fn) directly. Output is managed in two stages. First, the - model outputs as the result of evaluation computation, usually model logits, - should be transferred from TPU system to CPU. Then, all model outputs are - concatenated first on CPU and sent to the metric_fn for metrics computation. - To fit TPU evaluation pattern, the original eval computation should be - reformed, which is the returned `eval_step`. - - Args: - dequeue_fn: The function to retrieve inputs, features and labels, from TPU - infeed dequeue channel. - - Returns: - A tuple of eval_fn, host_calls, and captured scaffold_fn. The eval_fn - representing the eval step for TPU. - """ - host_calls = _OutfeedHostCall(self._ctx) - captured_scaffold_fn = _CapturedObject() - captured_eval_hooks = _CapturedObject() - - def eval_step(total_loss): - """Evaluation step function for use inside a while loop.""" - inputs = dequeue_fn() - features, labels = inputs.features_and_labels() - self._add_embedding_features(features, False) - - tpu_estimator_spec = self._call_model_fn(features, labels) - if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec): # pylint: disable=protected-access - raise RuntimeError( - 'estimator_spec used by TPU evaluation must have type' - '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec))) - - loss = tpu_estimator_spec.loss - captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn) - captured_eval_hooks.capture(tpu_estimator_spec.evaluation_hooks) - - to_record = {} - if tpu_estimator_spec.eval_metrics: - to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics - if tpu_estimator_spec.host_call is not None: - # We assume that evaluate won't update global step, so we don't wrap - # this host_call. 
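Editor's note: the fallback host call recorded in the training step above also illustrates the general contract: `host_call` is a `(fn, tensors)` pair, and every tensor must keep a leading dimension (a scalar loss gets reshaped to `[1]`) so per-replica values can later be concatenated on the host. A small self-contained sketch of that contract, with no outfeed involved:

```
import tensorflow as tf

loss = tf.constant(0.25)                       # scalar loss from one step

def host_fn(loss_t):
  # Runs on the host; receives the loss with its leading dimension intact.
  return tf.reduce_mean(loss_t)

# Every outfed tensor needs a leading (batch/replica) dimension, so the
# scalar loss is reshaped to shape [1] before being recorded.
host_call = (host_fn, [tf.reshape(loss, [1])])

fn, args = host_call
result = fn(*args)                             # CPU path: just call it
```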
- to_record['host_call'] = tpu_estimator_spec.host_call - host_calls.record(to_record) - - with ops.control_dependencies(host_calls.create_enqueue_op()): - return math_ops.add(total_loss, loss) - - return eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks - - def convert_to_single_tpu_predict_step(self, dequeue_fn): - """Converts user provided model_fn` as a single predict step on TPU. - - Args: - dequeue_fn: The function to retrieve inputs, features and labels, from TPU - infeed dequeue channel. - - Returns: - A tuple of predict_fn, host_calls, and captured scaffold_fn. The - predict_fn representing the predict step for TPU. - """ - host_calls = _OutfeedHostCall(self._ctx) - captured_scaffold_fn = _CapturedObject() - captured_predict_hooks = _CapturedObject() - - def predict_step(unused_scalar_stopping_signal): - """Evaluation step function for use inside a while loop.""" - inputs = dequeue_fn() - features, labels = inputs.features_and_labels() - stopping_signals = inputs.signals() - - assert stopping_signals is not None, ( - 'Internal Error: `signals` is missing.') - - tpu_estimator_spec = self._call_model_fn( - features, labels, is_export_mode=False) - if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec): # pylint: disable=protected-access - raise RuntimeError( - 'estimator_spec used by TPU prediction must have type' - '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec))) - - self._verify_tpu_spec_predictions(tpu_estimator_spec.predictions) - - captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn) - captured_predict_hooks.capture(tpu_estimator_spec.prediction_hooks) - to_record = {} - identity_fn = lambda **kwargs: kwargs - to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions] - to_record['signals'] = [identity_fn, stopping_signals] - if tpu_estimator_spec.host_call is not None: - to_record['host_call'] = tpu_estimator_spec.host_call - host_calls.record(to_record) - - with ops.control_dependencies(host_calls.create_enqueue_op()): - return _StopSignals.as_scalar_stopping_signal(stopping_signals) - - return (predict_step, host_calls, captured_scaffold_fn, - captured_predict_hooks) - - def _verify_tpu_spec_predictions(self, predictions): - """Validates TPUEstimatorSpec.predictions dict.""" - # TODO(xiejw): Adds validation for prediction dictionrary. - # TODO(xiejw): Adds support for single tensor as predictions. - if not isinstance(predictions, dict): - raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.') - - for (key, tensor) in predictions.items(): - if tensor.shape.dims[0].value is None: - raise ValueError( - 'The tensor with key ({}) in TPUEstimatorSpec.predictions has ' - 'dynamic shape (should be static). Tensor: {}'.format(key, tensor)) - return predictions - - def _validate_model_features_and_labels(self, features, labels, - is_export_mode): - """Validates that the features and labels for the model function are valid. - - A valid features/labels object is the one with: - - Type: A tensor or any nested structure of tensors supported by TF nest, - namely nested dictionary, tuple, namedtuple, or sequence of tensors. - - Static shape if is_export_mode is False. - - Args: - features: the features that would be input to the model function. - labels: the labels that would be input to the model function. - is_export_mode: boolean value specifying if in export mode. - - Raises: - TypeError: If features/labels are not of the correct type. - ValueError: If features/labels have dynamic shape. 
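Editor's note: both checks above (`_verify_tpu_spec_predictions` and `_validate_model_features_and_labels`) insist on fully defined static shapes; on the input side the usual way to satisfy them is to batch with `drop_remainder=True`. An illustrative sketch with made-up sizes:

```
import tensorflow as tf

images = tf.random.uniform([100, 32, 32, 3])
dataset = tf.data.Dataset.from_tensor_slices(images)

# batch() alone leaves the batch dimension unknown (None), which the
# validation above rejects; drop_remainder=True makes it static.
dynamic = dataset.batch(16)
static = dataset.batch(16, drop_remainder=True)

print(dynamic.element_spec.shape)   # (None, 32, 32, 3)
print(static.element_spec.shape)    # (16, 32, 32, 3)
```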
- """ - - def validate(obj, obj_name): - """Helper validate function.""" - if is_export_mode or self._ctx.is_running_on_cpu(is_export_mode): - return - if isinstance(obj, ops.Tensor): - if not obj.get_shape().is_fully_defined(): - raise ValueError( - 'The {} to the model returned by input_fn must have static shape.' - ' Tensor: {}'.format(obj_name, obj)) - else: - for tensor in data_nest.flatten(obj): - if not tensor.get_shape().is_fully_defined(): - raise ValueError( - ('The {} to the model returned by input_fn must have static ' - 'shape. Tensor: {}').format(obj_name, tensor)) - - validate(features, 'features') - if labels is not None: - validate(labels, 'labels') - - def _call_model_fn(self, features, labels, is_export_mode=False): - """Calls the model_fn with required parameters.""" - self._validate_model_features_and_labels(features, labels, is_export_mode) - model_fn_args = function_utils.fn_args(self._model_fn) - kwargs = {} - - # Makes deep copy with `config` and params` in case user mutates them. - config = copy.deepcopy(self._config) - params = copy.deepcopy(self._params) - - if 'labels' in model_fn_args: - kwargs['labels'] = labels - elif labels is not None: - raise ValueError( - 'model_fn does not take labels, but input_fn returns labels.') - if 'mode' in model_fn_args: - kwargs['mode'] = self._ctx.mode - if 'config' in model_fn_args: - kwargs['config'] = config - if 'params' in model_fn_args: - kwargs['params'] = params - - if 'params' not in model_fn_args: - raise ValueError('model_fn ({}) does not include params argument, ' - 'required by TPUEstimator to pass batch size as ' - 'params[\'batch_size\']'.format(self._model_fn)) - - if is_export_mode: - batch_size_for_model_fn = None - else: - batch_size_for_model_fn = self._ctx.batch_size_for_model_fn - - if batch_size_for_model_fn is not None: - _add_item_to_params(params, _BATCH_SIZE_KEY, batch_size_for_model_fn) - - running_on_cpu = self._ctx.is_running_on_cpu(is_export_mode) - # In export mode, params['use_tpu'] has already been set based on mode - # (i.e. True for _REWRITE_FOR_INFERENCE_MODE, False otherwise). - if not is_export_mode: - _add_item_to_params(params, _USE_TPU_KEY, not running_on_cpu) - - if not running_on_cpu: - user_context = tpu_context.TPUContext( - internal_ctx=self._ctx, call_from_input_fn=False) - _add_item_to_params(params, _CTX_KEY, user_context) - - estimator_spec = self._model_fn(features=features, **kwargs) - if (running_on_cpu and - isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)): # pylint: disable=protected-access - # The estimator_spec will be passed to `Estimator` directly, which expects - # type `EstimatorSpec`. - return estimator_spec.as_estimator_spec() - else: - return estimator_spec - - def _verify_estimator_spec(self, estimator_spec): - """Validates the estimator_spec.""" - if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec): # pylint: disable=protected-access - return estimator_spec - - err_msg = '{} returned by EstimatorSpec is not supported in TPUEstimator.' - if estimator_spec.training_chief_hooks: - raise ValueError( - err_msg.format('training_chief_hooks') + 'If you want' + - ' to pass training hooks, please pass via training_hooks.') - - if estimator_spec.scaffold: - logging.warning('EstimatorSpec.Scaffold is ignored by TPU train/eval. 
' - 'Please use TPUEstimatorSpec.') - return estimator_spec - - -class _OutfeedHostCall(object): - """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec.""" - - def __init__(self, ctx): - self._ctx = ctx - self._names = [] - # All of these are dictionaries of lists keyed on the name. - self._host_fns = {} - self._tensor_keys = collections.defaultdict(list) - self._tensors = collections.defaultdict(list) - self._tensor_dtypes = collections.defaultdict(list) - self._tensor_shapes = collections.defaultdict(list) - - @staticmethod - def validate(host_calls): - """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`.""" - - for name, host_call in host_calls.items(): - if not isinstance(host_call, (tuple, list)): - raise ValueError('{} should be tuple or list'.format(name)) - if len(host_call) != 2: - raise ValueError('{} should have two elements.'.format(name)) - if not callable(host_call[0]): - raise TypeError('{}[0] should be callable.'.format(name)) - if not isinstance(host_call[1], (tuple, list, dict)): - raise ValueError('{}[1] should be tuple or list, or dict.'.format(name)) - - if isinstance(host_call[1], (tuple, list)): - fullargspec = tf_inspect.getfullargspec(host_call[0]) - fn_args = function_utils.fn_args(host_call[0]) - # wrapped_hostcall_with_global_step uses varargs, so we allow that. - if fullargspec.varargs is None and len(host_call[1]) != len(fn_args): - raise RuntimeError( - 'In TPUEstimatorSpec.{}, length of tensors {} does not match ' - 'method args of the function, which takes {}.'.format( - name, len(host_call[1]), len(fn_args))) - - @staticmethod - def create_cpu_hostcall(host_calls): - """Runs on the host_call on CPU instead of TPU when use_tpu=False.""" - - _OutfeedHostCall.validate(host_calls) - ret = {} - for name, host_call in host_calls.items(): - host_fn, tensors = host_call - if isinstance(tensors, (tuple, list)): - ret[name] = host_fn(*tensors) - else: - # Must be dict. - try: - ret[name] = host_fn(**tensors) - except TypeError as e: - logging.warning( - 'Exception while calling %s: %s. It is likely the tensors ' - '(%s[1]) do not match the ' - 'function\'s arguments', name, e, name) - raise - return ret - - def record(self, host_calls): - """Records the host_call structure.""" - - for name, host_call in host_calls.items(): - host_fn, tensor_list_or_dict = host_call - self._names.append(name) - self._host_fns[name] = host_fn - - if isinstance(tensor_list_or_dict, dict): - for (key, tensor) in six.iteritems(tensor_list_or_dict): - self._tensor_keys[name].append(key) - self._tensors[name].append(tensor) - self._tensor_dtypes[name].append(tensor.dtype) - self._tensor_shapes[name].append(tensor.shape) - else: - # List or tuple. - self._tensor_keys[name] = None - for tensor in tensor_list_or_dict: - self._tensors[name].append(tensor) - self._tensor_dtypes[name].append(tensor.dtype) - self._tensor_shapes[name].append(tensor.shape) - - def create_enqueue_op(self): - """Create the op to enqueue the recorded host_calls. - - Returns: - A list of enqueue ops, which is empty if there are no host calls. - """ - if not self._names: - return [] - - tensors = [] - # TODO(jhseu): Consider deduping tensors. - for name in self._names: - tensors.extend(self._tensors[name]) - - with ops.device(tpu.core(0)): - return [tpu_ops.outfeed_enqueue_tuple(tensors)] - - def create_tpu_hostcall(self): - """Sends the tensors through outfeed and runs the host_fn on CPU. - - The tensors are concatenated along dimension 0 to form a global tensor - across all shards. 
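Editor's note: the `validate` routine above reduces to a small structural contract. Here is a stand-alone sketch of the same checks using the standard `inspect` module; the real code routes through `tf_inspect` and `function_utils`, so treat this only as an approximation.

```
import inspect

def validate_host_call(name, host_call):
  fn, tensors = host_call                      # must be a 2-element pair
  if not callable(fn):
    raise TypeError('{}[0] should be callable.'.format(name))
  if not isinstance(tensors, (tuple, list, dict)):
    raise ValueError('{}[1] should be tuple, list, or dict.'.format(name))
  if isinstance(tensors, (tuple, list)):
    spec = inspect.getfullargspec(fn)
    # For list/tuple args the function's positional arity must match, unless
    # it accepts *args.
    if spec.varargs is None and len(tensors) != len(spec.args):
      raise RuntimeError('{}: {} tensors vs. {} args.'.format(
          name, len(tensors), len(spec.args)))

validate_host_call('host_call', (lambda gs, loss: loss, [1, 2]))   # passes
```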
The concatenated function is passed to the host_fn and - executed on the first host. - - Returns: - A dictionary mapping name to the return type of the host_call by that - name. - - Raises: - RuntimeError: If outfeed tensor is scalar. - """ - if not self._names: - return {} - - ret = {} - # For each i, dequeue_ops[i] is a list containing the tensors from all - # shards. This list is concatenated later. - dequeue_ops = [] - tensor_dtypes = [] - tensor_shapes = [] - for name in self._names: - for _ in self._tensors[name]: - dequeue_ops.append([]) - for dtype in self._tensor_dtypes[name]: - tensor_dtypes.append(dtype) - for shape in self._tensor_shapes[name]: - tensor_shapes.append(shape) - - # Outfeed ops execute on each replica's first logical core. Note: we must - # constraint it such that we have at most one outfeed dequeue and enqueue - # per replica. - for i in xrange(self._ctx.num_replicas): - host_device, ordinal_id = self._ctx.device_for_replica(i) - with ops.device(host_device): - outfeed_tensors = tpu_ops.outfeed_dequeue_tuple( - dtypes=tensor_dtypes, - shapes=tensor_shapes, - device_ordinal=ordinal_id) - for j, item in enumerate(outfeed_tensors): - dequeue_ops[j].append(item) - - # Deconstruct dequeue ops. - flat_dequeue_ops = [] - for l in dequeue_ops: - flat_dequeue_ops.extend(l) - - dequeue_ops_by_name = {} - pos = 0 - for name in self._names: - dequeue_ops_by_name[name] = dequeue_ops[pos:pos + - len(self._tensors[name])] - pos += len(self._tensors[name]) - - def _call_host_fn(fn, *args, **kw): - context = CatchInvalidHostcallFunctions() - context.Enter() - result = fn(*args, **kw) - context.Exit() - context.ExitResult(result) - return result - - # It is assumed evaluation always happens on single host TPU system. So, - # place all ops on tpu host if possible. - # - # TODO(jhseu): Evaluate whether this is right for summaries. - with ops.device(self._ctx.tpu_host_placement_function(replica_id=0)): - for name in self._names: - dequeue_ops = dequeue_ops_by_name[name] - for i, item in enumerate(dequeue_ops): - if dequeue_ops[i][0].shape.ndims == 0: - raise RuntimeError( - 'All tensors outfed from TPU should preserve batch size ' - 'dimension, but got scalar {}'.format(dequeue_ops[i][0])) - # TODO(xiejw): Make the specification of the outfeed combinaton - # function more explicit and well-documented. We may want to give the - # user the option of concatenating along any axis. - if (self._ctx.config.tpu_config.per_host_input_for_training is - tpu_config.InputPipelineConfig.BROADCAST): - # If the infeed is in BROADCAST mode (each core recieving the same - # input), then we assume that the cores also produce identical - # copies of the same output, and we simply take the output from - # the first core. This mode is used by Mesh-TensorFlow. - with ops.control_dependencies(dequeue_ops[i]): - dequeue_ops[i] = array_ops.identity(dequeue_ops[i][0]) - else: - # Assume that the input has been batch-split and that axis 0 of the - # output tensors represents the batch size. Concatenate along - # the axis 0 to re-combine the batch. - dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0) - - if self._tensor_keys[name] is not None: - # The user-provided eval_metrics[1] is a dict. - dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops)) - try: - ret[name] = _call_host_fn(self._host_fns[name], **dequeue_ops) - except TypeError as e: - logging.warning( - 'Exception while calling %s: %s. 
It is likely the tensors ' - '(%s[1]) do not match the ' - 'function\'s arguments', name, e, name) - raise - else: - ret[name] = _call_host_fn(self._host_fns[name], *dequeue_ops) - - # force all dequeue operations to be run if not consumed by the host calls - ret['__force_dequeue'] = control_flow_ops.group(*flat_dequeue_ops) - return ret - - -class _OutfeedHostCallHook(session_run_hook.SessionRunHook): - """Hook to run host calls when use_tpu=False.""" - - def __init__(self, tensors): - self._tensors = tensors - - def begin(self): - # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than - # create a separate hook to guarantee execution order, because summaries - # need to be initialized before the outfeed thread starts. - # TODO(jhseu): Make a wrapper hook instead? - self._init_ops = contrib_summary.summary_writer_initializer_op() - # Get all the writer resources from the initializer, so we know what to - # flush. - self._finalize_ops = [] - for op in self._init_ops: - self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0])) - - def after_create_session(self, session, coord): - session.run(self._init_ops) - - def before_run(self, run_context): - return basic_session_run_hooks.SessionRunArgs(self._tensors) - - def end(self, session): - session.run(self._finalize_ops) - - -class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook): - """Calculate and report global_step/sec and examples/sec during runtime.""" - - def __init__(self, - batch_size, - every_n_steps=100, - every_n_secs=None, - output_dir=None, - summary_writer=None): - self._batch_size = batch_size - super(ExamplesPerSecondHook, self).__init__( - every_n_steps=every_n_steps, - every_n_secs=every_n_secs, - output_dir=output_dir, - summary_writer=summary_writer) - - def _log_and_record(self, elapsed_steps, elapsed_time, global_step): - global_step_per_sec = elapsed_steps / elapsed_time - examples_per_sec = self._batch_size * global_step_per_sec - if self._summary_writer is not None: - global_step_summary = Summary(value=[ - Summary.Value(tag='global_step/sec', simple_value=global_step_per_sec) - ]) - example_summary = Summary(value=[ - Summary.Value(tag='examples/sec', simple_value=examples_per_sec) - ]) - self._summary_writer.add_summary(global_step_summary, global_step) - self._summary_writer.add_summary(example_summary, global_step) - logging.info('global_step/sec: %g', global_step_per_sec) - logging.info('examples/sec: %g', examples_per_sec) - - -class InstallSignalHandlerHook(session_run_hook.SessionRunHook): - """Change SIGINT (CTRL^C) handler to force quit the process. - - The default behavior often results in hanging processes. - The original handler is restored after training/evaluation. - """ - - def __init__(self): - self._signal_fn = signal.getsignal(signal.SIGINT) - - def before_run(self, run_context): - signal.signal(signal.SIGINT, signal.SIG_DFL) - - def end(self, session): - signal.signal(signal.SIGINT, self._signal_fn) - - -class ExportSavedModelApiVersion(enum.Enum): - V1 = 1 - V2 = 2 - - -class BatchConfig( - collections.namedtuple('BatchConfig', [ - 'num_batch_threads', 'max_batch_size', 'batch_timeout_micros', - 'allowed_batch_sizes', 'max_enqueued_batches' - ])): - """Class to handle config inputs into the batching function.""" - - def __new__(cls, - num_batch_threads, - max_batch_size, - batch_timeout_micros, - allowed_batch_sizes, - max_enqueued_batches=10): - """Creates an EmbeddingConfigSpec instance. 
- - Args: - num_batch_threads: Number of scheduling threads for processing batches of - work. Determines the number of batches processed in parallel. - max_batch_size: Batch sizes will never be bigger than this. - batch_timeout_micros: Maximum number of microseconds to wait before - outputting an incomplete batch. - allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, - does nothing. Otherwise, supplies a list of batch sizes, causing the op - to pad batches up to one of those sizes. The entries must increase - monotonically, and the final entry must equal max_batch_size. - max_enqueued_batches: The maximum depth of the batch queue. Defaults to - 10. - - Returns: - An BatchConfig instance. - """ - return super(BatchConfig, cls).__new__( - cls, - num_batch_threads=num_batch_threads, - max_batch_size=max_batch_size, - batch_timeout_micros=batch_timeout_micros, - allowed_batch_sizes=allowed_batch_sizes, - max_enqueued_batches=max_enqueued_batches) - - -class TPUEstimator(estimator_lib.Estimator): - """Estimator with TPU support. - - TPUEstimator also supports training on CPU and GPU. You don't need to define - a separate `tf.estimator.Estimator`. - - TPUEstimator handles many of the details of running on TPU devices, such as - replicating inputs and models for each core, and returning to host - periodically to run hooks. - - TPUEstimator transforms a global batch size in params to a per-shard batch - size when calling the `input_fn` and `model_fn`. Users should specify - global batch size in constructor, and then get the batch size for each shard - in `input_fn` and `model_fn` by `params['batch_size']`. - - - For training, `model_fn` gets per-core batch size; `input_fn` may get - per-core or per-host batch size depending on `per_host_input_for_training` - in `TPUConfig` (See docstring for TPUConfig for details). - - - For evaluation and prediction, `model_fn` gets per-core batch size and - `input_fn` get per-host batch size. - - Evaluation - ========== - - `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics` - for TPU evaluation. If eval_on_tpu is False, the evaluation will execute on - CPU or GPU; in this case the following discussion on TPU evaluation does not - apply. - - `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where - `tensors` could be a list of any nested structure of `Tensor`s (See - `TPUEstimatorSpec` for details). `metric_fn` takes the `tensors` and returns - a dict from metric string name to the result of calling a metric function, - namely a `(metric_tensor, update_op)` tuple. - - One can set `use_tpu` to `False` for testing. All training, evaluation, and - predict will be executed on CPU. `input_fn` and `model_fn` will receive - `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`. - - Current limitations: - -------------------- - - 1. TPU evaluation only works on a single host (one TPU worker) except - BROADCAST mode. - - 2. `input_fn` for evaluation should **NOT** raise an end-of-input exception - (`OutOfRangeError` or `StopIteration`). And all evaluation steps and all - batches should have the same size. - - Example (MNIST): - ---------------- - - ``` - # The metric Fn which runs on CPU. 
- def metric_fn(labels, logits): - predictions = tf.argmax(logits, 1) - return { - 'accuracy': tf.compat.v1.metrics.precision( - labels=labels, predictions=predictions), - } - - # Your model Fn which runs on TPU (eval_metrics is list in this example) - def model_fn(features, labels, mode, config, params): - ... - logits = ... - - if mode = tf.estimator.ModeKeys.EVAL: - return tpu_estimator.TPUEstimatorSpec( - mode=mode, - loss=loss, - eval_metrics=(metric_fn, [labels, logits])) - - # or specify the eval_metrics tensors as dict. - def model_fn(features, labels, mode, config, params): - ... - final_layer_output = ... - - if mode = tf.estimator.ModeKeys.EVAL: - return tpu_estimator.TPUEstimatorSpec( - mode=mode, - loss=loss, - eval_metrics=(metric_fn, { - 'labels': labels, - 'logits': final_layer_output, - })) - ``` - - Prediction - ========== - - Prediction on TPU is an experimental feature to support large batch inference. - It is not designed for latency-critical system. In addition, due to some - usability issues, for prediction with small dataset, CPU `.predict`, i.e., - creating a new `TPUEstimator` instance with `use_tpu=False`, might be more - convenient. - - Note: In contrast to TPU training/evaluation, the `input_fn` for prediction - *should* raise an end-of-input exception (`OutOfRangeError` or - `StopIteration`), which serves as the stopping signal to `TPUEstimator`. To be - precise, the ops created by `input_fn` produce one batch of the data. - The `predict()` API processes one batch at a time. When reaching the end of - the data source, an end-of-input exception should be raised by one of these - operations. The user usually does not need to do this manually. As long as the - dataset is not repeated forever, the `tf.data` API will raise an end-of-input - exception automatically after the last batch has been produced. - - Note: Estimator.predict returns a Python generator. Please consume all the - data from the generator so that TPUEstimator can shutdown the TPU system - properly for user. - - Current limitations: - -------------------- - 1. TPU prediction only works on a single host (one TPU worker). - - 2. `input_fn` must return a `Dataset` instance rather than `features`. In - fact, .train() and .evaluate() also support Dataset as return value. - - Example (MNIST): - ---------------- - ``` - height = 32 - width = 32 - total_examples = 100 - - def predict_input_fn(params): - batch_size = params['batch_size'] - - images = tf.random.uniform( - [total_examples, height, width, 3], minval=-1, maxval=1) - - dataset = tf.data.Dataset.from_tensor_slices(images) - dataset = dataset.map(lambda images: {'image': images}) - - dataset = dataset.batch(batch_size) - return dataset - - def model_fn(features, labels, params, mode): - # Generate predictions, called 'output', from features['image'] - - if mode == tf.estimator.ModeKeys.PREDICT: - return tf.contrib.tpu.TPUEstimatorSpec( - mode=mode, - predictions={ - 'predictions': output, - 'is_padding': features['is_padding'] - }) - - tpu_est = TPUEstimator( - model_fn=model_fn, - ..., - predict_batch_size=16) - - # Fully consume the generator so that TPUEstimator can shutdown the TPU - # system. - for item in tpu_est.predict(input_fn=input_fn): - # Filter out item if the `is_padding` is 1. - # Process the 'predictions' - ``` - - Exporting - ========= - - `export_savedmodel` exports 2 metagraphs, one with `saved_model.SERVING`, and - another with `saved_model.SERVING` and `saved_model.TPU` tags. 
At serving - time, these tags are used to select the appropriate metagraph to load. - - Before running the graph on TPU, the TPU system needs to be initialized. If - TensorFlow Serving model-server is used, this is done automatically. If not, - please use `session.run(tpu.initialize_system())`. - - There are two versions of the API: ExportSavedModelApiVersion.V1 and V2. - - In V1, the exported CPU graph is `model_fn` as it is. The exported TPU graph - wraps `tpu.rewrite()` and `TPUPartitionedCallOp` around `model_fn` so - `model_fn` is on TPU by default. To place ops on CPU, - `tpu.outside_compilation(host_call, logits)` can be used. - - ``` - def model_fn(features, labels, mode, config, params): - ... - logits = ... - export_outputs = { - 'logits': export_output_lib.PredictOutput( - {'logits': logits}) - } - - def host_call(logits): - class_ids = math_ops.argmax(logits) - classes = string_ops.as_string(class_ids) - export_outputs['classes'] = - export_output_lib.ClassificationOutput(classes=classes) - - tpu.outside_compilation(host_call, logits) - - ... - ``` - - In V2, `export_savedmodel()` sets up `params['use_tpu']` flag to let the user - know if the code is exporting to TPU (or not). When `params['use_tpu']` is - `True`, users need to call `tpu.rewrite()`, `TPUPartitionedCallOp` and/or - `batch_function()`. Alternatively use `inference_on_tpu()` which is a - convenience wrapper of the three. - - ``` - def model_fn(features, labels, mode, config, params): - ... - # This could be some pre-processing on CPU like calls to input layer with - # embedding columns. - x2 = features['x'] * 2 - - def computation(input_tensor): - return layers.dense( - input_tensor, 1, kernel_initializer=init_ops.zeros_initializer()) - - inputs = [x2] - if params['use_tpu']: - predictions = array_ops.identity( - tpu_estimator.inference_on_tpu(computation, inputs, - num_batch_threads=1, max_batch_size=2, batch_timeout_micros=100), - name='predictions') - else: - predictions = array_ops.identity( - computation(*inputs), name='predictions') - key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY - export_outputs = { - key: export_lib.PredictOutput({'prediction': predictions}) - } - ... - ``` - - TIP: V2 is recommended as it is more flexible (eg: batching, etc). - - """ - - def __init__(self, - model_fn=None, - model_dir=None, - config=None, - params=None, - use_tpu=True, - train_batch_size=None, - eval_batch_size=None, - predict_batch_size=None, - batch_axis=None, - eval_on_tpu=True, - export_to_tpu=True, - export_to_cpu=True, - warm_start_from=None, - experimental_export_device_assignment=False, - embedding_config_spec=None, - export_saved_model_api_version=ExportSavedModelApiVersion.V1): - """Constructs an `TPUEstimator` instance. - - Args: - model_fn: Model function as required by `Estimator` which returns - EstimatorSpec or TPUEstimatorSpec. `training_hooks`, 'evaluation_hooks', - and `prediction_hooks` must not capure any TPU Tensor inside the - model_fn. - model_dir: Directory to save model parameters, graph and etc. This can - also be used to load checkpoints from the directory into a estimator to - continue training a previously saved model. If `None`, the model_dir in - `config` will be used if set. If both are set, they must be same. If - both are `None`, a temporary directory will be used. - config: An `tpu_config.RunConfig` configuration object. Cannot be `None`. - params: An optional `dict` of hyper parameters that will be passed into - `input_fn` and `model_fn`. 
Keys are names of parameters, values are - basic python types. There are reserved keys for `TPUEstimator`, - including 'batch_size'. - use_tpu: A bool indicating whether TPU support is enabled. Currently, - - TPU training and evaluation respect this bit, but eval_on_tpu can - override execution of eval. See below. - Predict still happens on CPU. - train_batch_size: An int representing the global training batch size. - TPUEstimator transforms this global batch size to a per-shard batch - size, as params['batch_size'], when calling `input_fn` and `model_fn`. - Cannot be `None` if `use_tpu` is `True`. Must be divisible by total - number of replicas. - eval_batch_size: An int representing evaluation batch size. Must be - divisible by total number of replicas. - predict_batch_size: An int representing the prediction batch size. Must be - divisible by total number of replicas. - batch_axis: A python tuple of int values describing how each tensor - produced by the Estimator `input_fn` should be split across the TPU - compute shards. For example, if your input_fn produced (images, labels) - where the images tensor is in `HWCN` format, your shard dimensions would - be [3, 0], where 3 corresponds to the `N` dimension of your images - Tensor, and 0 corresponds to the dimension along which to split the - labels to match up with the corresponding images. If None is supplied, - and per_host_input_for_training is True, batches will be sharded based - on the major dimension. If tpu_config.per_host_input_for_training is - False or `PER_HOST_V2`, batch_axis is ignored. - eval_on_tpu: If False, evaluation runs on CPU or GPU. In this case, the - model_fn must return `EstimatorSpec` when called with `mode` as `EVAL`. - export_to_tpu: If True, `export_savedmodel()` exports a metagraph for - serving on TPU. Note that unsupported export modes such as EVAL will be - ignored. For those modes, only a CPU model will be exported. - Currently, export_to_tpu only supports PREDICT. - export_to_cpu: If True, `export_savedmodel()` exports a metagraph for - serving on CPU. - warm_start_from: Optional string filepath to a checkpoint or SavedModel to - warm-start from, or a `tf.estimator.WarmStartSettings` object to fully - configure warm-starting. If the string filepath is provided instead of - a `WarmStartSettings`, then all variables are warm-started, and it is - assumed that vocabularies and Tensor names are unchanged. - experimental_export_device_assignment: Whether to include the device - assignment in the exported model. Doing so is useful in case of model - parallel inference but will tie the exported model to the TPU topology - used to export the model. - embedding_config_spec: Optional EmbeddingConfigSpec instance - to support using TPU embedding. - export_saved_model_api_version: ExportSavedModelApiVersion, V1 or V2. - With V1, `export_savedmodel()` adds rewrite() and TPUPartitionedCallOp() - for user; while in v2, user is expected to add rewrite(), - TPUPartitionedCallOp() etc in their model_fn. - A helper function `inference_on_tpu` is provided for V2. - brn_tpu_estimator.py includes examples for both versions - i.e. TPUEstimatorExportTest and TPUEstimatorExportV2Test. - - Raises: - ValueError: `params` has reserved keys already. 
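Editor's note: a hedged construction sketch for the arguments documented above. It assumes the `tf.compat.v1.estimator.tpu` namespace; the model_fn body, TPU worker address, and batch sizes are placeholders, not recommendations.

```
import tensorflow.compat.v1 as tf

def my_model_fn(features, labels, mode, params):
  """Stub for illustration; a real model_fn returns a full TPUEstimatorSpec."""
  del features, labels, params
  return tf.estimator.tpu.TPUEstimatorSpec(mode=mode, loss=tf.constant(0.0))

run_config = tf.estimator.tpu.RunConfig(
    master='grpc://10.0.0.2:8470',        # placeholder TPU worker address
    model_dir='/tmp/model',
    tpu_config=tf.estimator.tpu.TPUConfig(iterations_per_loop=100))

estimator = tf.estimator.tpu.TPUEstimator(
    model_fn=my_model_fn,                 # must accept `params`
    config=run_config,
    use_tpu=True,
    train_batch_size=1024,                # global; shards see params['batch_size']
    eval_batch_size=1024)
```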
- """ - if config is None or not isinstance(config, tpu_config.RunConfig): - raise ValueError( - '`config` must be provided with type `tpu_config.RunConfig`') - - if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS): - raise ValueError('{} are reserved keys but existed in params {}.'.format( - _RESERVED_PARAMS_KEYS, params)) - - if use_tpu: - # Perform some very basic validations. More validations will be found in - # _InternalTPUContext. - if train_batch_size is None: - raise ValueError('`train_batch_size` cannot be `None`') - util_lib.check_positive_integer(train_batch_size, 'train_batch_size') - - if (config.tpu_config.per_host_input_for_training is - tpu_config.InputPipelineConfig.PER_SHARD_V1 and - config.tpu_config.num_cores_per_replica): - raise ValueError( - 'Model parallelism only supports per host input for training. ' - 'Please adjust TPURunconfig.per_host_input_for_training.') - - if eval_batch_size is not None: - util_lib.check_positive_integer(eval_batch_size, 'eval_batch_size') - - if predict_batch_size is not None: - util_lib.check_positive_integer(predict_batch_size, - 'predict_batch_size') - - if embedding_config_spec: - if (config.tpu_config.per_host_input_for_training != - tpu_config.InputPipelineConfig.PER_HOST_V2): - raise ValueError('Only PER_HOST_V2 is supported when using TPU ' - 'Embedding; got {}.'.format( - config.tpu_config.per_host_input_for_training)) - - # Verifies the model_fn signature according to Estimator framework. - estimator_lib._verify_model_fn_args(model_fn, params) # pylint: disable=protected-access - # We cannot store config and params in this constructor as parent - # constructor might change them, such as assigning a temp dir for - # config.model_dir. - model_function = self._augment_model_fn(model_fn, batch_axis) - - # Overwrite log_step_count_steps to disable TensorLoggingHook and - # StepCounterHook from being created in Estimator. TPUEstimator already - # added equivalent hooks in _augment_model_fn above. - self._log_every_n_steps = config.log_step_count_steps - config = config.replace(log_step_count_steps=None) - - # Passing non-None params as wrapped model_fn has it. - params = params or {} - super(TPUEstimator, self).__init__( - model_fn=model_function, - model_dir=model_dir, - config=config, - params=params, - warm_start_from=warm_start_from) - self._iterations_per_training_loop = ( - self._config.tpu_config.iterations_per_loop) - - # All properties passed to _InternalTPUContext are immutable. 
- # pylint: disable=protected-access - self._ctx = tpu_context._get_tpu_context( - self._config, train_batch_size, eval_batch_size, predict_batch_size, - use_tpu, eval_on_tpu, embedding_config_spec) - - self._export_to_cpu = export_to_cpu - self._export_to_tpu = export_to_tpu - self._experimental_export_device_assignment = ( - experimental_export_device_assignment) - - if not isinstance(export_saved_model_api_version, - ExportSavedModelApiVersion): - raise ValueError('export_saved_model_api_version should be of type ' - 'ExportSavedModelApiVersion; got {}.'.format( - export_saved_model_api_version)) - self._export_saved_model_api_version = export_saved_model_api_version - self._is_input_fn_invoked = None - - self._rendezvous = {} - - def _add_meta_graph_for_mode(self, - builder, - input_receiver_fn_map, - checkpoint_path, - save_variables=True, - mode=model_fn_lib.ModeKeys.PREDICT, - export_tags=None, - check_variables=True, - strip_default_attrs=True): - if self._export_to_tpu and mode != model_fn_lib.ModeKeys.PREDICT: - logging.warning('TPUEstimator only handles mode PREDICT for exporting ' - 'when `export_to_tpu` is `True`; Mode {} will be ignored ' - 'for TPU.'.format(mode)) - - if not self._export_to_cpu and not self._export_to_tpu: - raise ValueError('One of export_to_cpu and export_to_tpu must be true.') - - if self._export_to_cpu: - (super(TPUEstimator, self)._add_meta_graph_for_mode( - builder, - input_receiver_fn_map, - checkpoint_path, - save_variables, - mode=mode, - export_tags=export_tags, - check_variables=check_variables, - strip_default_attrs=strip_default_attrs)) - - if self._export_to_tpu and mode == model_fn_lib.ModeKeys.PREDICT: - input_receiver_fn_map = { - _INFERENCE_ON_TPU_MODE: input_receiver_fn_map[mode] - } - export_tags = [tag_constants.SERVING, tag_constants.TPU] - mode = _INFERENCE_ON_TPU_MODE - - # See b/110052256 for why `check_variables` is `False`. - if not self._export_to_cpu: - check_variables = save_variables = True - else: - check_variables = save_variables = False - (super(TPUEstimator, self)._add_meta_graph_for_mode( - builder, - input_receiver_fn_map, - checkpoint_path, - save_variables=save_variables, - mode=mode, - export_tags=export_tags, - check_variables=check_variables, - strip_default_attrs=strip_default_attrs)) - - def _call_model_fn(self, features, labels, mode, config): - if self._export_saved_model_api_version == ExportSavedModelApiVersion.V1: - if mode == _INFERENCE_ON_TPU_MODE: - return self._call_model_fn_for_inference(features, labels, mode, config) - else: - return super(TPUEstimator, self)._call_model_fn(features, labels, mode, - config) - else: - return super(TPUEstimator, self)._call_model_fn(features, labels, mode, - config) - - def _call_model_fn_for_inference(self, features, labels, mode, config): - """Wraps `_call_model_fn` for `export_savedmodel`.""" - if mode != _INFERENCE_ON_TPU_MODE: - raise ValueError('mode must be {}; ' - 'got {}.'.format(_INFERENCE_ON_TPU_MODE, mode)) - return model_fn_inference_on_tpu( - self._model_fn, - features, - labels, - config, - self._params, - batch_config=None, - experimental_export_device_assignment=self - ._experimental_export_device_assignment, - call_context=self._ctx) - - def _create_global_step(self, graph): - """Creates a global step suitable for TPUs. - - Args: - graph: The graph in which to create the global step. - - Returns: - A global step `Tensor`. - - Raises: - ValueError: if the global step tensor is already defined. 
- """ - return _create_global_step(graph) - - def _convert_train_steps_to_hooks(self, steps, max_steps): - with self._ctx.with_mode(model_fn_lib.ModeKeys.TRAIN) as ctx: - if ctx.is_running_on_cpu(): - return super(TPUEstimator, self)._convert_train_steps_to_hooks( - steps, max_steps) - - # On TPU. - if steps is None and max_steps is None: - raise ValueError( - 'For TPU training, one of `steps` or `max_steps` must be set. ' - 'Cannot be both `None`.') - - # Estimator.train has explicit positiveness check. - if steps is not None: - util_lib.check_positive_integer(steps, 'Train steps') - if max_steps is not None: - util_lib.check_positive_integer(max_steps, 'Train max_steps') - - return [ - _TPUStopAtStepHook(self._iterations_per_training_loop, steps, max_steps) - ] - - def _convert_eval_steps_to_hooks(self, steps): - with self._ctx.with_mode(model_fn_lib.ModeKeys.EVAL) as ctx: - if ctx.is_running_on_cpu(): - return super(TPUEstimator, self)._convert_eval_steps_to_hooks(steps) - - if steps is None: - raise ValueError('Evaluate `steps` must be set on TPU. Cannot be `None`.') - - util_lib.check_positive_integer(steps, 'Eval steps') - - return [ - evaluation._StopAfterNEvalsHook( # pylint: disable=protected-access - num_evals=steps), - _SetEvalIterationsHook(steps) - ] - - def _call_input_fn(self, input_fn, mode): - """Calls the input function. - - Args: - input_fn: The input function. - mode: ModeKeys - - Returns: - In TPU mode, returns an input_fn to be called later in model_fn. - Otherwise, calls the input_fn and returns either fatures or - (features, labels). - - Raises: - ValueError: if input_fn takes invalid arguments or does not have `params`. - """ - input_fn_args = function_utils.fn_args(input_fn) - config = self.config # a deep copy. - kwargs = {} - if 'params' in input_fn_args: - kwargs['params'] = self.params # a deep copy. - else: - raise ValueError('input_fn ({}) does not include params argument, ' - 'required by TPUEstimator to pass batch size as ' - 'params["batch_size"]'.format(input_fn)) - if 'config' in input_fn_args: - kwargs['config'] = config - - if 'mode' in input_fn_args: - kwargs['mode'] = mode - - # Records the fact input_fn has been invoked. - self._is_input_fn_invoked = True - - with self._ctx.with_mode(mode) as ctx: - # Setting the batch size in params first. This helps user to have same - # input_fn for use_tpu=True/False. - batch_size_for_input_fn = ctx.batch_size_for_input_fn - if batch_size_for_input_fn is not None: - _add_item_to_params(kwargs['params'], _BATCH_SIZE_KEY, - batch_size_for_input_fn) - - # For export_savedmodel, input_fn is never passed to Estimator. So, - # `is_export_mode` must be False. - if ctx.is_running_on_cpu(is_export_mode=False): - with ops.device('/device:CPU:0'): - return input_fn(**kwargs) - - # For TPU computation, input_fn should be invoked in a tf.while_loop for - # performance. While constructing the tf.while_loop, the structure of - # inputs returned by the `input_fn` needs to be recorded. The structure - # includes whether features or labels is dict or single Tensor, dict keys, - # tensor shapes, and dtypes. The recorded structure is used to create the - # infeed dequeue ops, which must be wrapped and passed as a Fn, called - # inside the TPU computation, as the TPU computation is wrapped inside a - # tf.while_loop also. So, we either pass input_fn to model_fn or pass - # dequeue_fn to model_fn. Here, `input_fn` is passed directly as - # `features` in `model_fn` signature. 
- def _input_fn(ctx): - _add_item_to_params(kwargs['params'], _CTX_KEY, ctx) - return input_fn(**kwargs) - - return _input_fn - - def _validate_features_in_predict_input(self, result): - """Skip the validation. - - For TPUEstimator, we do not need to check the result type. `_InputPipeline` - has stronger check. Parent class's check generates confusing warning msg. - - Args: - result: `features` returned by input_fn. - """ - pass - - def train(self, - input_fn, - hooks=None, - steps=None, - max_steps=None, - saving_listeners=None): - rendezvous = error_handling.ErrorRendezvous(num_sources=3) - self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous - try: - return super(TPUEstimator, self).train( - input_fn=input_fn, - hooks=hooks, - steps=steps, - max_steps=max_steps, - saving_listeners=saving_listeners) - except Exception: # pylint: disable=broad-except - rendezvous.record_error('training_loop', sys.exc_info()) - finally: - rendezvous.record_done('training_loop') - rendezvous.raise_errors() - - def evaluate(self, - input_fn, - steps=None, - hooks=None, - checkpoint_path=None, - name=None): - rendezvous = error_handling.ErrorRendezvous(num_sources=3) - self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous - try: - return super(TPUEstimator, self).evaluate( - input_fn, - steps=steps, - hooks=hooks, - checkpoint_path=checkpoint_path, - name=name) - except Exception: # pylint: disable=broad-except - rendezvous.record_error('evaluation_loop', sys.exc_info()) - finally: - rendezvous.record_done('evaluation_loop') - rendezvous.raise_errors() - - def predict(self, - input_fn, - predict_keys=None, - hooks=None, - checkpoint_path=None, - yield_single_examples=True): - rendezvous = error_handling.ErrorRendezvous(num_sources=3) - self._rendezvous[model_fn_lib.ModeKeys.PREDICT] = rendezvous - try: - for result in super(TPUEstimator, self).predict( - input_fn=input_fn, - predict_keys=predict_keys, - hooks=hooks, - checkpoint_path=checkpoint_path, - yield_single_examples=yield_single_examples): - yield result - except Exception: # pylint: disable=broad-except - rendezvous.record_error('prediction_loop', sys.exc_info()) - finally: - rendezvous.record_done('prediction_loop') - rendezvous.raise_errors() - - rendezvous.record_done('prediction_loop') - rendezvous.raise_errors() - - def _augment_model_fn(self, model_fn, batch_axis): - """Returns a new model_fn, which wraps the TPU support.""" - - def _model_fn(features, labels, mode, config, params): - """A Estimator `model_fn` for TPUEstimator.""" - - # `input_fn` is called in `train()`, `evaluate()`, and `predict()`, - # but not in `export_savedmodel()`. - if self._is_input_fn_invoked: - is_export_mode = False - else: - is_export_mode = True - - # Clear the bit. - self._is_input_fn_invoked = None - - if is_export_mode: - if mode == _INFERENCE_ON_TPU_MODE: - _add_item_to_params(params, _USE_TPU_KEY, True) - mode = model_fn_lib.ModeKeys.PREDICT - else: - _add_item_to_params(params, _USE_TPU_KEY, False) - - with self._ctx.with_mode(mode) as ctx: - model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx) - - # examples_hook is added to training_hooks for both CPU and TPU - # execution. 
- if self._log_every_n_steps is not None: - examples_hook = ExamplesPerSecondHook( - ctx.global_batch_size, - # pylint:disable=g-long-ternary - output_dir=(self.model_dir - if not config or config.save_summary_steps - else None), - # pylint:enable=g-long-ternary - every_n_steps=self._log_every_n_steps) - - if ctx.is_running_on_cpu(is_export_mode=is_export_mode): - logging.info('Running %s on CPU', mode) - estimator_spec = model_fn_wrapper.call_without_tpu( - features, labels, is_export_mode=is_export_mode) - if self._log_every_n_steps is not None: - estimator_spec = estimator_spec._replace( - training_hooks=estimator_spec.training_hooks + (examples_hook,)) - return estimator_spec - - assert labels is None, '`labels` passed to `model_fn` must be `None`.' - # TPUEstimator._call_input_fn passes `input_fn` as features to here. - assert callable(features), '`input_fn` is not callable.' - input_fn = features - - tpu_init_ops = [] - if ctx.embedding_config and mode == model_fn_lib.ModeKeys.TRAIN: - dummy_table_variables, dummy_table_variables_init = ( - tpu_embedding_gradient.create_dummy_table_variables( - ctx.embedding_config.tpu_embedding)) - ctx.embedding_config.dummy_table_variables = dummy_table_variables - tpu_init_ops.append(dummy_table_variables_init) - - input_holders = _InputPipeline(input_fn, batch_axis, ctx) - enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = ( - input_holders.generate_infeed_enqueue_ops_and_dequeue_fn()) - - graph = ops.get_default_graph() - for enqueue_op in enqueue_ops: - if isinstance(enqueue_op, list): - graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op) - else: - graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op) - - if mode == model_fn_lib.ModeKeys.TRAIN: - compile_op, loss, host_call, scaffold_fn, training_hooks = ( - _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn)) - if ctx.embedding_config: - g = ops.get_default_graph() - table_to_config_dict = ( - ctx.embedding_config.tpu_embedding.table_to_config_dict) - optimization_parameters = ( - ctx.embedding_config.tpu_embedding.optimization_parameters) - embedding_variable_name_by_table, slot_variable_names_by_table = ( - _tpu_estimator_embedding.get_full_variable_names( - g, table_to_config_dict, optimization_parameters - ) - ) - embedding_variables_and_ops = ( - ctx.embedding_config.tpu_embedding.create_variables_and_ops( - embedding_variable_name_by_table, - slot_variable_names_by_table - )) - tpu_init_ops.extend(embedding_variables_and_ops.load_ops()) - # scaffold_fn must be called after variables for TPU embedding has - # been created on CPU, as user might reinitialize those from some - # checkpoint within scaffold_fn. 
- scaffold = _get_scaffold(scaffold_fn) - - host_ops = host_call.create_tpu_hostcall() - - shutdown_hooks = [] - shutdown_mode = os.environ.get('TF_TPU_GRACEFUL_SHUTDOWN_MODE', - 'reset_computation') - if shutdown_mode: - if shutdown_mode == 'shutdown_worker': - finalizer_hooks = [ - session_support.ShutdownLameWorkers(), - ] - elif shutdown_mode == 'shutdown_all_workers': - finalizer_hooks = [ - session_support.ShutdownAllWorkers(), - ] - elif shutdown_mode == 'reset_computation': - finalizer_hooks = [ - session_support.ResetComputation(), - ] - elif not shutdown_mode: - finalizer_hooks = [] - else: - raise ValueError( - 'Unknown TF_TPU_GRACEFUL_SHUTDOWN_MODE "%s"' % shutdown_mode) - - if finalizer_hooks: - shutdown_hooks.append( - session_support.GracefulShutdownHook( - checkpoint_prefix=self.model_dir + '/model.ckpt', - on_shutdown_hooks=finalizer_hooks)) - - with ops.control_dependencies([loss]): - global_step = array_ops.identity(training.get_global_step()) - hooks = input_hooks + shutdown_hooks - hooks.extend([ - TPUInfeedOutfeedSessionHook( - ctx, - enqueue_ops, - host_ops, - tpu_compile_op=compile_op, - run_infeed_loop_on_coordinator=( - run_infeed_loop_on_coordinator), - rendezvous=self._rendezvous[mode], - master=self._config.master, - session_config=self._session_config, - tpu_init_ops=tpu_init_ops), - InstallSignalHandlerHook() - ]) - if tpu_cluster_resolver.is_running_in_gce(): - hooks.extend( - [preempted_hook.CloudTPUPreemptedHook(self._config.cluster)]) - if self._log_every_n_steps is not None: - logging_hook_frequency = ( # Divide and round up - (self._log_every_n_steps + - self._config.tpu_config.iterations_per_loop - 1) // - self._config.tpu_config.iterations_per_loop) - hooks.append( - training.LoggingTensorHook({ - 'loss': array_ops.identity(loss), - 'step': global_step, - }, - every_n_iter=logging_hook_frequency)) - examples_hook._set_steps_per_run( # pylint: disable=protected-access - self._config.tpu_config.iterations_per_loop) - hooks.append(examples_hook) - - if training_hooks: - hooks.extend(training_hooks) - - chief_hooks = [] - if (self._config.save_checkpoints_secs or - self._config.save_checkpoints_steps): - checkpoint_hook = training.CheckpointSaverHook( - self.model_dir, - save_secs=self._config.save_checkpoints_secs, - save_steps=self._config.save_checkpoints_steps, - scaffold=scaffold) - checkpoint_hook._set_steps_per_run( # pylint: disable=protected-access - self._config.tpu_config.iterations_per_loop) - chief_hooks.append(checkpoint_hook) - - summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss) - with ops.control_dependencies([loss]): - update_ops = _sync_variables_ops(ctx) - if ctx.embedding_config: - update_ops.extend(embedding_variables_and_ops.retrieve_ops()) - - # Validate the TPU training graph to catch basic errors - _validate_tpu_training_graph() - - train_op = control_flow_ops.group(*update_ops) - graph.add_to_collection(_TPU_TRAIN_OP, train_op) - - return model_fn_lib.EstimatorSpec( - mode, - loss=loss, - training_chief_hooks=chief_hooks, - training_hooks=hooks, - train_op=train_op, - scaffold=scaffold) - - if mode == model_fn_lib.ModeKeys.EVAL: - compile_op, total_loss, host_calls, scaffold_fn, eval_hooks = ( - _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn)) - if ctx.embedding_config: - g = ops.get_default_graph() - table_to_config_dict = ( - ctx.embedding_config.tpu_embedding.table_to_config_dict) - embedding_variable_name_by_table, _ = ( - _tpu_estimator_embedding.get_full_variable_names( - g, table_to_config_dict) - ) - 
embedding_variables_and_ops = ( - ctx.embedding_config.tpu_embedding.create_variables_and_ops( - embedding_variable_name_by_table - )) - tpu_init_ops.extend(embedding_variables_and_ops.load_ops()) - # scaffold_fn must be called after variables for TPU embedding has - # been created on CPU, as user might reinitialize those from some - # checkpoint within scaffold_fn. - scaffold = _get_scaffold(scaffold_fn) - iterations_per_loop_var = _create_or_get_iterations_per_loop() - mean_loss = math_ops.div( - total_loss, - math_ops.cast(iterations_per_loop_var, dtype=total_loss.dtype)) - - with ops.control_dependencies([mean_loss]): - # After TPU evaluation computation is done (the mean_loss tensor), - # reads all variables back from TPU and updates the eval step - # counter properly - internal_ops_to_run = _sync_variables_ops(ctx) - internal_ops_to_run.append( - _increase_eval_step_op(iterations_per_loop_var)) - - host_call_ret = host_calls.create_tpu_hostcall() - eval_metric_ops = {} - eval_update_ops = [] - - eval_metrics = host_call_ret.get('eval_metrics', {}) - if eval_metrics: - # Creates a dummy metric update_op for all metrics. Estimator - # expects all metrics in `eval_metric_ops` have update_op and calls - # them one by one. The real metric update_ops are invoked in a - # separated thread. So, here give Estimator the dummy op for all - # metrics. - with ops.control_dependencies(internal_ops_to_run): - dummy_update_op = control_flow_ops.no_op() - - for k, v in eval_metrics.items(): - eval_metric_ops[k] = (v[0], dummy_update_op) - eval_update_ops.append(v[1]) - else: - # If no eval metrics are passed, create an identity node for the - # loss and add `internal_ops_to_run` to its dependencies. So - # `internal_ops_to_run` can be executed. - with ops.control_dependencies(internal_ops_to_run): - mean_loss = array_ops.identity(mean_loss) - - if 'host_call' not in host_call_ret: - host_ops = [] - else: - host_ops = host_call_ret['host_call'] - hooks = [ - TPUInfeedOutfeedSessionHook( - ctx, - enqueue_ops, - eval_update_ops + host_ops, - tpu_compile_op=compile_op, - run_infeed_loop_on_coordinator=( - run_infeed_loop_on_coordinator), - rendezvous=self._rendezvous[mode], - master=self._config.evaluation_master, - session_config=self._session_config, - tpu_init_ops=tpu_init_ops) - ] + input_hooks - - if tpu_cluster_resolver.is_running_in_gce(): - hooks.extend( - [preempted_hook.CloudTPUPreemptedHook(self._config.cluster)]) - - if eval_hooks: - hooks.extend(eval_hooks) - - return model_fn_lib.EstimatorSpec( - mode, - loss=mean_loss, - evaluation_hooks=hooks, - eval_metric_ops=eval_metric_ops, - scaffold=scaffold) - - # Predict - assert mode == model_fn_lib.ModeKeys.PREDICT - - (compile_op, dummy_predict_op, host_calls, - scaffold_fn, prediction_hooks) = _predict_on_tpu_system( - ctx, model_fn_wrapper, dequeue_fn) - scaffold = _get_scaffold(scaffold_fn) - with ops.control_dependencies([dummy_predict_op]): - internal_ops_to_run = _sync_variables_ops(ctx) - with ops.control_dependencies(internal_ops_to_run): - dummy_predict_op = control_flow_ops.no_op() - - # In train and evaluation, the main TPU program is passed to monitored - # training session to run. Infeed enqueue and outfeed dequeue are - # executed in side threads. This is not the configuration for - # prediction mode. - # - # For prediction, the Estimator executes the EstimatorSpec.predictions - # directly and yield the element (via generator) to call site. So, the - # outfeed based prediction must be passed to MonitoredSession directly. 
- # Other parts of the TPU execution are organized as follows. - # - # 1. All outfeed based Tensors must be grouped with predictions Tensors - # to form a single invocation. This avoid the issue we might trigger - # multiple outfeeds incorrectly. To achieve this, `host_call` is - # placed in control_dependencies of `stopping_signals`, and - # `stopping_signals` is passed into _StoppingPredictHook, which sets - # the `stopping_signals` as SessionRunArgs. MonitoredSession merges - # all SessionRunArgs with the fetch in session.run together. - # - # 2. The TPU program (dummy_predict_op) and enqueue_ops (infeed Enqueue) - # are grouped together. They will be launched once and only once in - # side threads and they quit naturally according to the SAME stopping - # condition. - enqueue_ops.append(dummy_predict_op) - - host_call_ret = host_calls.create_tpu_hostcall() - if 'host_call' not in host_call_ret: - host_ops = [] - else: - host_ops = host_call_ret['host_call'] - - predictions = host_call_ret['predictions'] - _verify_cross_hosts_transfer_size( - predictions, - message=( - 'The estimated size for TPUEstimatorSpec.predictions is too ' - 'large.')) - signals = host_call_ret['signals'] - - with ops.control_dependencies(host_ops): - host_ops = [] # Empty, we do do not need it anymore. - scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal( - signals) - predictions = _PaddingSignals.slice_tensor_or_dict( - predictions, signals) - - hooks = [ - _StoppingPredictHook(scalar_stopping_signal), - TPUInfeedOutfeedSessionHookForPrediction( - ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode], - tpu_compile_op=compile_op, - master=self._config.master, - session_config=self._session_config), - ] + input_hooks - - if prediction_hooks: - hooks.extend(prediction_hooks) - - return model_fn_lib.EstimatorSpec( - mode, - prediction_hooks=hooks, - predictions=predictions, - scaffold=scaffold) - - return _model_fn - - -def _export_output_to_tensors(export_output): - """Get a list of `Tensors` used in `export_output`. - - Args: - export_output: an `ExportOutput` object such as `ClassificationOutput`, - `RegressionOutput`, or `PredictOutput`. - - Returns: - a list of tensors used in export_output. - - Raises: - ValueError: if `export_output` is not one of `ClassificationOutput`, - `RegressionOutput`, or `PredictOutput`. - """ - if isinstance(export_output, export_output_lib.ClassificationOutput): - return [export_output.scores, export_output.classes] - elif isinstance(export_output, export_output_lib.RegressionOutput): - return [export_output.value] - elif isinstance(export_output, export_output_lib.PredictOutput): - return list(export_output.outputs.values()) - else: - raise ValueError( - '`export_output` must be have type `ClassificationOutput`, ' - '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output)) - - -def _clone_export_output_with_tensors(export_output, tensors): - """Clones `export_output` but with new `tensors`. - - Args: - export_output: an `ExportOutput` object such as `ClassificationOutput`, - `RegressionOutput`, or `PredictOutput`. - tensors: a list of `Tensors` used to construct a new `export_output`. - - Returns: - A dict similar to `export_output` but with `tensors`. - - Raises: - ValueError: if `export_output` is not one of `ClassificationOutput`, - `RegressionOutput`, or `PredictOutput`. 
- """ - if isinstance(export_output, export_output_lib.ClassificationOutput): - if len(tensors) != 2: - raise ValueError('tensors must be of length 2; ' - 'got {}.'.format(len(tensors))) - return export_output_lib.ClassificationOutput(*tensors) - elif isinstance(export_output, export_output_lib.RegressionOutput): - if len(tensors) != 1: - raise ValueError('tensors must be of length 1; ' - 'got {}'.format(len(tensors))) - return export_output_lib.RegressionOutput(*tensors) - elif isinstance(export_output, export_output_lib.PredictOutput): - return export_output_lib.PredictOutput( - dict(zip(export_output.outputs.keys(), tensors))) - else: - raise ValueError( - '`export_output` must be have type `ClassificationOutput`, ' - '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output)) - - -def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): - """Executes `model_fn_wrapper` multiple times on all TPU shards.""" - iterations_per_loop_var = _create_or_get_iterations_per_loop() - - (single_tpu_eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks - ) = model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn) - - @tpu_function.on_device_training_loop - def multi_tpu_eval_steps_on_single_shard(): - return training_loop.repeat(iterations_per_loop_var, single_tpu_eval_step, - [_ZERO_LOSS]) - - (compile_op, loss,) = tpu.split_compile_and_shard( - multi_tpu_eval_steps_on_single_shard, - inputs=[], - num_shards=ctx.num_replicas, - outputs_from_all_shards=False, - device_assignment=ctx.device_assignment) - - loss = loss[0] - return (compile_op, loss, host_calls, captured_scaffold_fn, - captured_eval_hooks.get()) - - -def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): - """Executes `model_fn_wrapper` multiple times on all TPU shards.""" - iterations_per_loop_var = _create_or_get_iterations_per_loop() - - (single_tpu_train_step, host_call, captured_scaffold_fn, - captured_training_hooks) = ( - model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn)) - - @tpu_function.on_device_training_loop - def multi_tpu_train_steps_on_single_shard(): - return training_loop.repeat(iterations_per_loop_var, single_tpu_train_step, - [_INITIAL_LOSS]) - - (compile_op, loss,) = tpu.split_compile_and_shard( - multi_tpu_train_steps_on_single_shard, - inputs=[], - num_shards=ctx.num_replicas, - outputs_from_all_shards=False, - device_assignment=ctx.device_assignment) - - loss = loss[0] - return (compile_op, loss, host_call, captured_scaffold_fn, - captured_training_hooks.get()) - - -def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): - """Executes `model_fn_wrapper` multiple times on all TPU shards.""" - (single_tpu_predict_step, host_calls, captured_scaffold_fn, - captured_predict_hooks - ) = model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn) - - @tpu_function.on_device_training_loop - def multi_tpu_predict_steps_on_single_shard(): - - def cond(scalar_stopping_signal): - return math_ops.logical_not( - _StopSignals.should_stop(scalar_stopping_signal)) - - inputs = [_StopSignals.NON_STOPPING_SIGNAL] - outputs = training_loop.while_loop( - cond, single_tpu_predict_step, inputs=inputs, name=b'loop') - return outputs - - (compile_op, dummy_predict_op,) = tpu.split_compile_and_shard( - multi_tpu_predict_steps_on_single_shard, - inputs=[], - num_shards=ctx.num_replicas, - outputs_from_all_shards=False, - device_assignment=ctx.device_assignment) - - dummy_predict_op = dummy_predict_op[0] - return (compile_op, dummy_predict_op, host_calls, captured_scaffold_fn, - 
captured_predict_hooks.get()) - - -def _wrap_computation_in_while_loop(device, op_fn): - """Wraps the ops generated by `op_fn` in tf.while_loop.""" - - def computation(i): - with ops.control_dependencies(op_fn()): - return i + 1 - - iterations_per_loop_var = _create_or_get_iterations_per_loop() - # By setting parallel_iterations=1, the parallel execution in while_loop is - # basically turned off. - with ops.device(device): - iterations = array_ops.identity(iterations_per_loop_var) - return control_flow_ops.while_loop( - lambda i: i < iterations, - computation, [constant_op.constant(0)], - parallel_iterations=1) - - -def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn): - """Wraps the ops generated by `op_fn` in tf.while_loop.""" - - def cond(scalar_stopping_signal): - return math_ops.logical_not( - _StopSignals.should_stop(scalar_stopping_signal)) - - def computation(unused_scalar_stopping_signal): - return_value = op_fn() - execute_ops = return_value['ops'] - signals = return_value['signals'] - with ops.control_dependencies(execute_ops): - return _StopSignals.as_scalar_stopping_signal(signals) - - # By setting parallel_iterations=1, the parallel execution in while_loop is - # basically turned off. - with ops.device(device): - return control_flow_ops.while_loop( - cond, - computation, [_StopSignals.NON_STOPPING_SIGNAL], - parallel_iterations=1) - - -def _validate_tpu_training_graph(): - """Validate graph before running distributed training. - - Raises: - ValueError: If the graph seems invalid for running on device - """ - operations = ops.get_default_graph().get_operations() - - # Check if there is atleast one CrossReplicaSum operation in the graph - # This should be introduced by using the CrossShardOptimizer wrapper - cross_replica_sum_ops = [ - o for o in operations if o.type == _CROSS_REPLICA_SUM_OP - ] - if not cross_replica_sum_ops: - raise ValueError( - 'CrossShardOptimizer must be used for model training on TPUs.') - - -class _CapturedObject(object): - """A placeholder to capture an object. - - This is useful when we need to capture a Python object in the Tensorflow - control flow body function and use it outside the control flow. - """ - - def __init__(self): - self._object = None - self._captured = False - - def capture(self, o): - if self._captured: - raise RuntimeError( - 'InternalError: Object can capture only once. Please file bug.') - - self._captured = True - self._object = o - - def get(self): - if not self._captured: - raise RuntimeError( - 'InternalError: Object is not captured properly before `get`. 
' - 'Please file bug.') - return self._object - - -def _get_scaffold(captured_scaffold_fn): - """Retrieves the Scaffold from `captured_scaffold_fn`.""" - with _CapturingContext(message='Inside scaffold_fn'): - scaffold_fn = captured_scaffold_fn.get() - if scaffold_fn: - scaffold = scaffold_fn() - if scaffold is None: - raise ValueError( - 'TPUEstimatorSpec.scaffold_fn returns None, which is not allowed') - else: - scaffold = None - - if scaffold: - wrapped_finalize = scaffold.finalize - - def _finalize(): - with _CapturingContext('Inside Scaffold.finalize'): - wrapped_finalize() - - scaffold.finalize = _finalize - return scaffold - - -class _CapturingContext(control_flow_ops.ControlFlowContext): - """Tracks references to Tensors defined in TPU replication.""" - - def __init__(self, message): - control_flow_ops.ControlFlowContext.__init__(self) - self._message = message - - def to_control_flow_context_def(self, context_def, export_scope=None): - # pylint: disable=useless-super-delegation - # NOTE(slebedev): the method is required by `ControlFlowContext`. - super(_CapturingContext, self).to_control_flow_context_def( - context_def, export_scope) - - def AddOp(self, op): # pylint: disable=invalid-name - for c in op.inputs: - if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr: # pylint: disable=protected-access - raise ValueError('{}: Op {} depends on TPU computation {}, ' - 'which is not allowed.'.format(self._message, op, c)) - - def __enter__(self): - # pylint: disable=protected-access - self._g = ops.get_default_graph() - self._old = self._g._get_control_flow_context() - self._g._set_control_flow_context(self) - # pylint: enable=protected-access - - def __exit__(self, _, __, ___): # pylint: disable=invalid-name - self._g._set_control_flow_context(self._old) # pylint: disable=protected-access - - -class _Inputs(object): - """A data structure representing the input_fn returned values. - - This also supports the returned value from input_fn as `Dataset`. - """ - - def __init__(self, features=None, labels=None, dataset=None, signals=None): - if dataset is not None and (features is not None or labels is not None or - signals is not None): - raise RuntimeError('Internal Error: Either (features and labels) or ' - 'dataset should be provided, not both. Please file ' - 'bug') - - self._features = features - self._labels = labels - self._signals = signals - - self._dataset = dataset - self._iterator = None - - @staticmethod - def from_input_fn(return_values): - """Returns an `_Inputs` instance according to `input_fn` return value.""" - if isinstance(return_values, dataset_ops.DatasetV2): - dataset = return_values - return _Inputs(dataset=dataset) - - features, labels = _Inputs._parse_inputs(return_values) - return _Inputs(features, labels) - - @staticmethod - def _parse_inputs(return_values): - if isinstance(return_values, tuple): - features, labels = return_values - else: - features, labels = return_values, None - return features, labels - - @property - def is_dataset(self): - """Returns True if the return value from input_fn is Dataset.""" - return self._dataset is not None - - def dataset_initializer(self): - """Returns the dataset's initializer. - - The initializer must be run before calling `features_and_labels`. 
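As an aside, the initializer-before-get_next contract described in the docstring above can be sketched with the public tf.data API; the dataset contents below are made up for illustration, and the real code wraps this in the internal _Inputs class:

import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices({'a': [1., 2., 3., 4.]}).batch(2)
iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
features = iterator.get_next()

with tf.compat.v1.Session() as sess:
  sess.run(iterator.initializer)  # must run before fetching any features
  print(sess.run(features))       # {'a': array([1., 2.], dtype=float32)}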
- """ - self._iterator = dataset_ops.make_initializable_iterator(self._dataset) - return self._iterator.initializer - - def features_and_labels(self): - """Gets `features` and `labels`.""" - if self.is_dataset: - if self._iterator is None: - raise RuntimeError('Internal error: Must run dataset_initializer ' - 'before calling features_and_labels(). Please file ' - 'a bug!') - return _Inputs._parse_inputs(self._iterator.get_next()) - - return (self._features, self._labels) - - def signals(self): - return self._signals - - @property - def dataset(self): - return self._dataset - - -class _InputsWithStoppingSignals(_Inputs): - """Inputs with `_StopSignals` inserted into the dataset.""" - - def __init__(self, - dataset, - batch_size, - add_padding=False, - num_invocations_per_step=1): - - assert dataset is not None - user_provided_dataset = dataset.map( - _InputsWithStoppingSignals.insert_stopping_signal( - stop=False, batch_size=batch_size, add_padding=add_padding)) - if num_invocations_per_step == 1: - final_batch_dataset = dataset.take(1).map( - _InputsWithStoppingSignals.insert_stopping_signal( - stop=True, batch_size=batch_size, add_padding=add_padding)) - else: - # We append (2 * num_invocations_per_step - 1) batches for exhausting the - # user_provided_dataset and stop properly. - # For example, if num_invocations_per_step is 2, we append 3 additional - # padding batches: b1, b2, b3. - # If user_provided_dataset contains two batches: a1, a2 - # Step 1: [a1, a2] - # Step 2: [b1, b2] -> STOP - # If user_provided_dataset contains three batches: a1, a2, a3. - # The training loops: - # Step 1: [a1, a2] - # Step 2: [a3, b1] - # Step 3: [b2, b3] -> STOP. - final_batch_dataset = dataset.take(1).map( - _InputsWithStoppingSignals.insert_stopping_signal( - stop=True, batch_size=batch_size, add_padding=add_padding)) - final_batch_dataset = final_batch_dataset.repeat( - 2 * num_invocations_per_step - 1) - - def _set_mask(data_dict): - signals = data_dict['signals'] - signals['padding_mask'] = array_ops.ones_like(signals['padding_mask']) - data_dict['signals'] = signals - return data_dict - - # Mask out the extra batch. - final_batch_dataset = final_batch_dataset.map(_set_mask) - - dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2) - - super(_InputsWithStoppingSignals, self).__init__(dataset=dataset) - self._current_inputs = None - - def features_and_labels(self): - if self._current_inputs is not None: - raise RuntimeError( - 'Internal Error: The previous inputs have not been properly ' - 'consumed. First call features_and_labels, then call signals.') - - inputs_with_signals = self._iterator.get_next() - features = inputs_with_signals['features'] - labels = inputs_with_signals.get('labels') - - self._current_inputs = inputs_with_signals - return features, labels - - def signals(self): - """Returns the `Signals` from `_Inputs`.""" - if self._current_inputs is None: - raise RuntimeError( - 'Internal Error: The current inputs have not been properly ' - 'generated. First call features_and_labels, then call signals.') - signals = self._current_inputs['signals'] - self._current_inputs = None - return signals - - @staticmethod - def insert_stopping_signal(stop, batch_size, add_padding=False): - """Inserts stopping_signal into dataset via _map_fn. - - Here we change the data structure in the dataset, such that the return value - is a dictionary now and `features`, `labels`, and `signals` are three - distinguished keys in that dict. 
This provides a better structure, which - eases the process to decompose the inputs (see `features_and_labels`). - - Args: - stop: bool, state of current stopping signals. - batch_size: int, batch size. - add_padding: bool, whether to pad the tensor to full batch size. - - Returns: - A map_fn passed to dataset.map API. - """ - - def _map_fn(*args): - """The map fn to insert signals.""" - if len(args) == 1: - # Unpack the single Tensor/dict argument as features. This is required - # for the input_fn returns no labels. - args = args[0] - features, labels = _Inputs._parse_inputs(args) - new_input_dict = {} - - if add_padding: - padding_mask, features, labels = ( - _PaddingSignals.pad_features_and_labels(features, labels, - batch_size)) - - new_input_dict['features'] = features - if labels is not None: - new_input_dict['labels'] = labels - - else: - new_input_dict['features'] = features - if labels is not None: - new_input_dict['labels'] = labels - padding_mask = None - - new_input_dict['signals'] = _StopSignals( - stop=stop, batch_size=batch_size, - padding_mask=padding_mask).as_dict() - - return new_input_dict - - return _map_fn - - -class _StopSignals(object): - """Signals class holding all logic to handle TPU stopping condition.""" - - NON_STOPPING_SIGNAL = False - STOPPING_SIGNAL = True - - def __init__(self, stop, batch_size, padding_mask=None): - self._stop = stop - self._batch_size = batch_size - self._padding_mask = padding_mask - - def as_dict(self): - """Returns the signals as Python dict.""" - shape = [self._batch_size, 1] - dtype = dtypes.bool - - if self._stop: - stopping = array_ops.ones(shape=shape, dtype=dtype) - else: - stopping = array_ops.zeros(shape=shape, dtype=dtype) - - signals = {'stopping': stopping} - if self._padding_mask is not None: - signals['padding_mask'] = self._padding_mask - return signals - - @staticmethod - def as_scalar_stopping_signal(signals): - return array_ops.identity(signals['stopping'][0][0]) - - @staticmethod - def should_stop(scalar_stopping_signal): - """Detects whether scalar_stopping_signal indicates stopping.""" - if isinstance(scalar_stopping_signal, ops.Tensor): - # STOPPING_SIGNAL is a constant True. Here, the logical_and is just the TF - # way to express the bool check whether scalar_stopping_signal is True. - return math_ops.logical_and(scalar_stopping_signal, - _StopSignals.STOPPING_SIGNAL) - else: - # For non Tensor case, it is used in SessionRunHook. So, we cannot modify - # the graph anymore. Here, we use pure Python. 
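The pure-Python branch matters because the same scalar signal is re-checked inside a SessionRunHook, where the graph can no longer be modified. A rough, illustrative hook (not the actual _StoppingPredictHook) that consumes it might look like:

import tensorflow as tf

class StopOnSignalHook(tf.train.SessionRunHook):
  """Requests a stop once the fetched stopping signal evaluates to True."""

  def __init__(self, scalar_stopping_signal):
    self._signal = scalar_stopping_signal

  def before_run(self, run_context):
    # Merged by MonitoredSession into the fetches of the main session.run().
    return tf.train.SessionRunArgs(self._signal)

  def after_run(self, run_context, run_values):
    if bool(run_values.results):  # pure Python; no graph changes allowed here
      run_context.request_stop()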
- return bool(scalar_stopping_signal) - - -class _PaddingSignals(object): - """Signals class holding all logic to handle padding.""" - - @staticmethod - def pad_features_and_labels(features, labels, batch_size): - """Pads out the batch dimension of features and labels.""" - real_batch_size = array_ops.shape( - _PaddingSignals._find_any_tensor(features))[0] - - batch_size_tensor = constant_op.constant(batch_size, dtypes.int32) - - check_greater = check_ops.assert_greater_equal( - batch_size_tensor, - real_batch_size, - data=(batch_size_tensor, real_batch_size), - message='The real batch size should not be greater than batch_size.') - - with ops.control_dependencies([check_greater]): - missing_count = batch_size_tensor - real_batch_size - - def pad_single_tensor(tensor): - """Pads out the batch dimension of a tensor to the complete batch_size.""" - rank = len(tensor.shape) - assert rank > 0 - padding = array_ops.stack([[0, missing_count]] + [[0, 0]] * (rank - 1)) - padded_shape = (batch_size,) + tuple(tensor.shape[1:]) - padded_tensor = array_ops.pad(tensor, padding) - padded_tensor.set_shape(padded_shape) - return padded_tensor - - def nest_pad(tensor_or_dict): - return nest.map_structure(pad_single_tensor, tensor_or_dict) - - features = nest_pad(features) - if labels is not None: - labels = nest_pad(labels) - - padding_mask = _PaddingSignals._padding_mask(real_batch_size, missing_count, - batch_size) - - return padding_mask, features, labels - - @staticmethod - def slice_tensor_or_dict(tensor_or_dict, signals): - """Slice the real Tensors according to padding mask in signals.""" - - padding_mask = signals['padding_mask'] - batch_size = array_ops.shape(padding_mask)[0] - - def verify_batch_size(tensor): - check_batch_size = math_ops.equal(batch_size, tensor.shape[0]) - with ops.control_dependencies([check_batch_size]): - return array_ops.identity(tensor) - - def slice_single_tensor(tensor): - rank = len(tensor.shape) - assert rank > 0 - real_batch_size = batch_size - math_ops.reduce_sum(padding_mask) - return verify_batch_size(tensor)[0:real_batch_size] - - # As we split the Tensors to all TPU cores and concat them back, it is - # important to ensure the real data is placed before padded ones, i.e., - # order is preserved. By that, the sliced padding mask should have all 0's. - # If this assertion failed, # the slice logic here would not hold. - sliced_padding_mask = slice_single_tensor(padding_mask) - assert_padding_mask = math_ops.equal( - math_ops.reduce_sum(sliced_padding_mask), 0) - - with ops.control_dependencies([assert_padding_mask]): - should_stop = _StopSignals.should_stop( - _StopSignals.as_scalar_stopping_signal(signals)) - - is_full_batch = math_ops.equal(math_ops.reduce_sum(padding_mask), 0) - - def slice_fn(tensor): - # If the current batch is full batch or part of stopping signals, we do - # not need to slice to save performance. 
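A stripped-down version of the padding that produces such a mask, assuming a static real batch of 3 padded up to batch_size 4 (the real code computes missing_count dynamically and handles nested structures):

import tensorflow as tf

batch_size = 4
tensor = tf.constant([[1.], [2.], [3.]])         # real batch of 3 examples
missing = batch_size - 3

padded = tf.pad(tensor, [[0, missing], [0, 0]])  # pad only the batch dimension
padded.set_shape((batch_size, 1))                # shape is now fully static

padding_mask = tf.concat(
    [tf.zeros([3], tf.int32), tf.ones([missing], tf.int32)], axis=0)
# padded       -> [[1.], [2.], [3.], [0.]]
# padding_mask -> [0, 0, 0, 1]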
- return control_flow_ops.cond( - math_ops.logical_or(should_stop, is_full_batch), - (lambda: verify_batch_size(tensor)), - (lambda: slice_single_tensor(tensor))) - - return nest.map_structure(slice_fn, tensor_or_dict) - - @staticmethod - def _find_any_tensor(batch_features): - tensors = [ - x for x in nest.flatten(batch_features) if isinstance(x, ops.Tensor) - ] - if not tensors: - raise ValueError('Cannot find any Tensor in features dict.') - return tensors[0] - - @staticmethod - def _padding_mask(real_batch_size, missing_count, batch_size): - padding_mask = array_ops.concat([ - array_ops.zeros((real_batch_size,), dtype=dtypes.int32), - array_ops.ones((missing_count,), dtype=dtypes.int32) - ], - axis=0) - padding_mask.set_shape((batch_size,)) - return padding_mask - - -def _verify_cross_hosts_transfer_size(tensor_dict, message): - total_size = 0 - tensor_structure = {} - for key, tensor in tensor_dict.items(): - shape = tensor.shape - size = np.product(shape) * tensor.dtype.size - tensor_structure[key] = shape - total_size += size - if total_size >= _ONE_GIGABYTE: - raise ValueError( - '{} The transfer size is larger than the protobuf limit. Please ' - 'consider to use Tensors with smaller shapes or reduce batch ' - 'size. Given:\n' - '{}'.format( - message, '\n'.join([ - ' -- Key: {}, Shape: {}'.format(k, v) - for k, v in tensor_structure.items() - ]))) - - -def _add_item_to_params(params, key, value): - """Adds a new item into `params`.""" - if hasattr(params, 'set_hparam'): - # For HParams, we need to use special API. - if key in params: - params.set_hparam(key, value) - else: - params.add_hparam(key, value) - else: - # Now params is Python dict. - params[key] = value - - -def export_estimator_savedmodel(estimator, - export_dir_base, - serving_input_receiver_fn, - assets_extra=None, - as_text=False, - checkpoint_path=None, - strip_default_attrs=False): - """Export `Estimator` trained model for TPU inference. - - Args: - estimator: `Estimator` with which model has been trained. - export_dir_base: A string containing a directory in which to create - timestamped subdirectories containing exported SavedModels. - serving_input_receiver_fn: A function that takes no argument and returns a - `ServingInputReceiver` or `TensorServingInputReceiver`. - assets_extra: A dict specifying how to populate the assets.extra directory - within the exported SavedModel, or `None` if no extra assets are needed. - as_text: whether to write the SavedModel proto in text format. - checkpoint_path: The checkpoint path to export. If `None` (the default), - the most recent checkpoint found within the model directory is chosen. - strip_default_attrs: Boolean. If `True`, default-valued attributes will be - removed from the NodeDefs. - - Returns: - The string path to the exported directory. - """ - # `TPUEstimator` requires `tpu_config.RunConfig`, so we cannot use - # `estimator.config`. - config = tpu_config.RunConfig(model_dir=estimator.model_dir) - est = TPUEstimator( - estimator._model_fn, # pylint: disable=protected-access - config=config, - params=estimator.params, - use_tpu=True, - train_batch_size=2048, # Does not matter. - eval_batch_size=2048, # Does not matter. 
- ) - return est.export_savedmodel(export_dir_base, serving_input_receiver_fn, - assets_extra, as_text, checkpoint_path, - strip_default_attrs) - - -def model_fn_inference_on_tpu(model_fn, - features, - labels=None, - config=None, - params=None, - batch_config=None, - experimental_export_device_assignment=False, - call_context=None): - """Convenience wrapper for export_saved_model API v2 for a model_fn. - - It attempts to execute the entire model function on the TPU for prediction. - Note that this does not support features which are SparseTensors. If you have - SparseTensor features, consider partitioning your model function further and - use inference_on_tpu. - - Args: - model_fn: the model_fn for which we want to inference on TPU. - features: a tensor or dict of tensors, serves as the feature inputs to the - model. - labels: a tensor or dict of tensors, serves as the labels inputs to the - model. - config: auxiliary config to the Estimator. - params: hparams that we want to pass to the model_fn. - batch_config: a named tuple to wrap the inference batching configuration - inputs. - experimental_export_device_assignment: Whether to include the device - assignment in the exported model. Doing so is useful in case of model - parallel inference but will tie the exported model to the TPU topology - used to export the model. - call_context: an optional TPUContext under which the TPU run configuartion - is stored. - - Returns: - An EstimatorSpec containing the outputs in export_outputs and predictions. - """ - computation, capture = _build_computation_for_inference( - model_fn, labels, config, params, experimental_export_device_assignment, - call_context) - tensors = call_computation( - features, - computation, - experimental_export_device_assignment= - experimental_export_device_assignment, - batch_config=batch_config) - estimator_spec, export_outputs_dict, predictions_dict, none_indices = ( - capture.get()) - predictions_list = tensors[:len(predictions_dict)] - export_outputs_list_without_none = tensors[len(predictions_dict):] - - # Reinsert `None`s which we've taken out in - # `_build_computation_for_inference()`. - export_outputs_list = [] - while none_indices or export_outputs_list_without_none: - if none_indices and none_indices[0] == len(export_outputs_list): - export_outputs_list.append(None) - none_indices.pop(0) - else: - export_outputs_list.append(export_outputs_list_without_none.pop(0)) - - # Reconstruct `export_outputs` with updated tensors. - new_export_outputs_dict = nest.pack_sequence_as(export_outputs_dict, - export_outputs_list) - export_outputs = estimator_spec.export_outputs - new_export_outputs = collections.OrderedDict( - (k, _clone_export_output_with_tensors(export_outputs[k], v)) - for k, v in six.iteritems(new_export_outputs_dict)) - # Reconstruct `predictions` with updated tensors. 
- new_predictions = nest.pack_sequence_as(predictions_dict, predictions_list) - if (len(new_predictions) == 1 and - _KEY_WHEN_PREDICTIONS_IS_A_TENSOR in new_predictions): - new_predictions = new_predictions[_KEY_WHEN_PREDICTIONS_IS_A_TENSOR] - - return estimator_spec._replace( - export_outputs=new_export_outputs, predictions=new_predictions) - - -def _build_computation_for_inference(model_fn, - labels, - config, - params, - experimental_export_device_assignment, - call_context=None): - """Builds the computation with calls the model_fn for inference.""" - capture = _CapturedObject() - - def computation(computation_input): - """Computation to be passed to `TPUPartitionedCall()`.""" - tpu_computation, tpu_capture = _build_tpu_computation_for_inference( - model_fn, computation_input, labels, config, params) - - if experimental_export_device_assignment and call_context: - # Export the device assignment as part of the model. This is useful for - # model parallel usecases where the model relies on the mapping between - # logical and physical devices. - with call_context.with_mode(_INFERENCE_ON_TPU_MODE) as ctx: - device_assignment = ctx.device_assignment - else: - device_assignment = None - - if experimental_export_device_assignment: - tensors_on_cpu = tpu.rewrite_for_inference( - tpu_computation, device_assignment=device_assignment) - else: - tensors_on_cpu = tpu.rewrite( - tpu_computation, device_assignment=device_assignment) - tpu.prune_unconnected_ops_from_xla(ops.get_default_graph()) - - (estimator_spec, export_outputs_dict, export_outputs_list, - predictions_dict) = ( - tpu_capture.get()) - predictions_list = tensors_on_cpu[:len(predictions_dict)] - export_outputs_tpu_on_cpu_list = tensors_on_cpu[len(predictions_dict):] - - # Reconstruct tensors used in export_outputs, with TPU tensors replaced - # with their CPU counterpart returned from `rewrite_for_inference()`. - # `function.Defun()` does not like `None`s in return values, so we leave - # `None`s out but record their positions for later reconstruction. - export_outputs_list_without_none = [] - none_indices = [] - for i, t in enumerate(export_outputs_list): - if t is None: - none_indices.append(i) - else: - export_outputs_list_without_none.append( - export_outputs_tpu_on_cpu_list.pop(0)) - - capture.capture( - (estimator_spec, export_outputs_dict, predictions_dict, none_indices)) - return predictions_list + export_outputs_list_without_none - - return computation, capture - - -def _build_tpu_computation_for_inference(model_fn, features, labels, config, - params): - """Builds the TPU computation for inference on TPU.""" - capture = _CapturedObject() - - def computation(): - """Compute tpu tensors used in export_outputs. - - Passed to rewrite_for_inference so that model_fn will be called under - the rewriting contexts. Only tpu tensors are returned, but export_outputs - and scaffold are captured. - - Returns: - A list of Tensors used in export_outputs and not marked for - outside_compilation. - """ - # We should only call model fn once and it should be inside `computation` - # so that building the graph will happen under `rewrite_for_inference`. - - model_fn_args = function_utils.fn_args(model_fn) - kwargs = {} - # Makes deep copy with `config` and params` in case user mutates them. 
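The `None` bookkeeping shared by `model_fn_inference_on_tpu()` and `_build_computation_for_inference()` above is a plain list round-trip; in pure Python (values made up):

outputs = ['scores', None, 'classes', None]

# Strip: remember where the Nones were, keep only real values.
none_indices = [i for i, v in enumerate(outputs) if v is None]
without_none = [v for v in outputs if v is not None]

# Reinsert: walk the kept values back in, restoring None at recorded positions.
restored = []
while none_indices or without_none:
  if none_indices and none_indices[0] == len(restored):
    restored.append(None)
    none_indices.pop(0)
  else:
    restored.append(without_none.pop(0))

assert restored == ['scores', None, 'classes', None]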
- if 'labels' in model_fn_args: - kwargs['labels'] = labels - if 'mode' in model_fn_args: - kwargs['mode'] = model_fn_lib.ModeKeys.PREDICT - if 'config' in model_fn_args: - kwargs['config'] = config - if 'params' in model_fn_args: - kwargs['params'] = params - estimator_spec = model_fn(features, **kwargs) - - # We pick the TPU tensors out from `export_output` and later return them - # from `computation` for rewriting. - export_outputs_dict = collections.OrderedDict( - (k, _export_output_to_tensors(v)) - for k, v in six.iteritems(estimator_spec.export_outputs)) - export_outputs_list = nest.flatten(export_outputs_dict) - export_outputs_tpu_list = [t for t in export_outputs_list if t is not None] - - if isinstance(estimator_spec.predictions, dict): - predictions_dict = collections.OrderedDict( - (k, v) for k, v in six.iteritems(estimator_spec.predictions)) - else: - predictions_dict = { - _KEY_WHEN_PREDICTIONS_IS_A_TENSOR: estimator_spec.predictions - } - predictions_list = nest.flatten(predictions_dict) - - # We cannot return everything we want through the return values, so - # capture the rest here for later use. - capture.capture((estimator_spec, export_outputs_dict, export_outputs_list, - predictions_dict)) - return predictions_list + export_outputs_tpu_list - - return computation, capture - - -def inference_on_tpu(computation, - inputs_to_tpu, - num_batch_threads, - max_batch_size, - batch_timeout_micros, - allowed_batch_sizes=None, - max_enqueued_batches=10): - """Convenient wrapper for export_saved_model API v2 to wrap TPU computation. - - It puts computation on TPU, add batching around it and round robin computation - between TPU cores. - - See tpu_estimator_test.py for an example. - - Args: - computation: computation to be put on TPU, which takes inputs_to_tpu as - arguments. - inputs_to_tpu: a list of tensors as input to computation. - num_batch_threads: Number of scheduling threads for processing batches of - work. Determines the number of batches processed in parallel. - max_batch_size: Batch sizes will never be bigger than this. - batch_timeout_micros: Maximum number of microseconds to wait before - outputting an incomplete batch. - allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, - does nothing. Otherwise, supplies a list of batch sizes, causing the op to - pad batches up to one of those sizes. The entries must increase - monotonically, and the final entry must equal max_batch_size. - max_enqueued_batches: The maximum depth of the batch queue. Defaults to 10. - - Returns: - The unbatched computation output Tensors. 
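A hedged usage sketch of the wrapper documented above; `my_tpu_computation` and `example_features` are hypothetical stand-ins for a serving-time computation and its input tensor, and the batching numbers are arbitrary:

outputs = inference_on_tpu(
    computation=my_tpu_computation,     # runs on TPU, consumes example_features
    inputs_to_tpu=[example_features],
    num_batch_threads=4,
    max_batch_size=128,
    batch_timeout_micros=5000,
    allowed_batch_sizes=[32, 64, 128],  # must increase and end at max_batch_size
    max_enqueued_batches=10)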
- """ - - @batch_ops.batch_function(num_batch_threads, max_batch_size, - batch_timeout_micros, allowed_batch_sizes, - max_enqueued_batches) - def batched_tpu_computation(*args): - - @function.Defun(capture_resource_var_by_value=False) - def tpu_computation(): - return tpu.rewrite(computation, args) - - return tpu_functional.TPUPartitionedCall( - args=tpu_computation.captured_inputs, - device_ordinal=tpu_ops.tpu_ordinal_selector(), - Tout=[o.type for o in tpu_computation.definition.signature.output_arg], - f=tpu_computation) - - return batched_tpu_computation(*inputs_to_tpu) +# pylint: disable=wildcard-import,unused-import,redefined-builtin +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import * +# used by tests +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _clone_export_output_with_tensors +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _create_global_step +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _export_output_to_tensors +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _get_scaffold +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _Inputs +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _ITERATIONS_PER_LOOP_VAR +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_ENQUEUE_OPS +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_ESTIMATOR +from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_TRAIN_OP +# pylint: enable=wildcard-import,unused-import,redefined-builtin diff --git a/tensorflow/python/tpu/tpu_estimator_signals_test.py b/tensorflow/python/tpu/tpu_estimator_signals_test.py deleted file mode 100644 index ca3eeaa9c9a..00000000000 --- a/tensorflow/python/tpu/tpu_estimator_signals_test.py +++ /dev/null @@ -1,339 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""TPU Estimator Signalling Tests.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from tensorflow.python.client import session -from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import errors -from tensorflow.python.framework import ops -from tensorflow.python.platform import test -from tensorflow.python.tpu import tpu_estimator - - -def make_input_fn(num_samples): - a = np.linspace(0, 100.0, num=num_samples) - b = np.reshape(np.array(a, dtype=np.float32), (len(a), 1)) - - def input_fn(params): - batch_size = params['batch_size'] - da1 = dataset_ops.Dataset.from_tensor_slices(a) - da2 = dataset_ops.Dataset.from_tensor_slices(b) - - dataset = dataset_ops.Dataset.zip((da1, da2)) - dataset = dataset.map(lambda fa, fb: {'a': fa, 'b': fb}) - dataset = dataset.batch(batch_size) - return dataset - return input_fn, (a, b) - - -def make_input_fn_with_labels(num_samples): - a = np.linspace(0, 100.0, num=num_samples) - b = np.reshape(np.array(a, dtype=np.float32), (len(a), 1)) - - def input_fn(params): - batch_size = params['batch_size'] - da1 = dataset_ops.Dataset.from_tensor_slices(a) - da2 = dataset_ops.Dataset.from_tensor_slices(b) - - dataset = dataset_ops.Dataset.zip((da1, da2)) - dataset = dataset.map(lambda fa, fb: ({'a': fa}, fb)) - dataset = dataset.batch(batch_size) - return dataset - return input_fn, (a, b) - - -class TPUEstimatorStoppingSignalsTest(test.TestCase): - - def test_normal_output_without_signals(self): - num_samples = 4 - batch_size = 2 - - params = {'batch_size': batch_size} - input_fn, (a, b) = make_input_fn(num_samples=num_samples) - - with ops.Graph().as_default(): - dataset = input_fn(params) - features = dataset_ops.make_one_shot_iterator(dataset).get_next() - - # With tf.data.Dataset.batch, the batch is None, i.e., dynamic shape. - self.assertIsNone(features['a'].shape.as_list()[0]) - - with session.Session() as sess: - result = sess.run(features) - self.assertAllEqual(a[:batch_size], result['a']) - self.assertAllEqual(b[:batch_size], result['b']) - - # This run should work as num_samples / batch_size = 2. - result = sess.run(features) - self.assertAllEqual(a[batch_size:num_samples], result['a']) - self.assertAllEqual(b[batch_size:num_samples], result['b']) - - with self.assertRaises(errors.OutOfRangeError): - # Given num_samples and batch_size, this run should fail. - sess.run(features) - - def test_output_with_stopping_signals(self): - num_samples = 4 - batch_size = 2 - - params = {'batch_size': batch_size} - input_fn, (a, b) = make_input_fn(num_samples=num_samples) - - with ops.Graph().as_default(): - dataset = input_fn(params) - inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size) - dataset_initializer = inputs.dataset_initializer() - features, _ = inputs.features_and_labels() - signals = inputs.signals() - - # With tf.data.Dataset.batch, the batch is None, i.e., dynamic shape. - self.assertIsNone(features['a'].shape.as_list()[0]) - - with session.Session() as sess: - sess.run(dataset_initializer) - - result, evaluated_signals = sess.run([features, signals]) - self.assertAllEqual(a[:batch_size], result['a']) - self.assertAllEqual(b[:batch_size], result['b']) - self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping']) - - # This run should work as num_samples / batch_size = 2. 
- result, evaluated_signals = sess.run([features, signals]) - self.assertAllEqual(a[batch_size:num_samples], result['a']) - self.assertAllEqual(b[batch_size:num_samples], result['b']) - self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping']) - - # This run should work, *but* see STOP ('1') as signals - _, evaluated_signals = sess.run([features, signals]) - self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping']) - - with self.assertRaises(errors.OutOfRangeError): - sess.run(features) - - -class TPUEstimatorStoppingSignalsWithPaddingTest(test.TestCase): - - def test_num_samples_divisible_by_batch_size(self): - num_samples = 4 - batch_size = 2 - - params = {'batch_size': batch_size} - input_fn, (a, b) = make_input_fn(num_samples=num_samples) - - with ops.Graph().as_default(): - dataset = input_fn(params) - inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size, - add_padding=True) - dataset_initializer = inputs.dataset_initializer() - features, _ = inputs.features_and_labels() - signals = inputs.signals() - - # With padding, all shapes are static now. - self.assertEqual(batch_size, features['a'].shape.as_list()[0]) - - with session.Session() as sess: - sess.run(dataset_initializer) - - result, evaluated_signals = sess.run([features, signals]) - self.assertAllEqual(a[:batch_size], result['a']) - self.assertAllEqual(b[:batch_size], result['b']) - self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping']) - self.assertAllEqual([0.] * batch_size, - evaluated_signals['padding_mask']) - - # This run should work as num_samples / batch_size = 2. - result, evaluated_signals = sess.run([features, signals]) - self.assertAllEqual(a[batch_size:num_samples], result['a']) - self.assertAllEqual(b[batch_size:num_samples], result['b']) - self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping']) - self.assertAllEqual([0.] * batch_size, - evaluated_signals['padding_mask']) - - # This run should work, *but* see STOP ('1') as signals - _, evaluated_signals = sess.run([features, signals]) - self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping']) - - with self.assertRaises(errors.OutOfRangeError): - sess.run(features) - - def test_num_samples_not_divisible_by_batch_size(self): - num_samples = 5 - batch_size = 2 - - params = {'batch_size': batch_size} - input_fn, (a, b) = make_input_fn_with_labels(num_samples=num_samples) - - with ops.Graph().as_default(): - dataset = input_fn(params) - inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size, - add_padding=True) - dataset_initializer = inputs.dataset_initializer() - features, labels = inputs.features_and_labels() - signals = inputs.signals() - - # With padding, all shapes are static. - self.assertEqual(batch_size, features['a'].shape.as_list()[0]) - - with session.Session() as sess: - sess.run(dataset_initializer) - - evaluated_features, evaluated_labels, evaluated_signals = ( - sess.run([features, labels, signals])) - self.assertAllEqual(a[:batch_size], evaluated_features['a']) - self.assertAllEqual(b[:batch_size], evaluated_labels) - self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping']) - self.assertAllEqual([0.] * batch_size, - evaluated_signals['padding_mask']) - - # This run should work as num_samples / batch_size >= 2. 
- evaluated_features, evaluated_labels, evaluated_signals = ( - sess.run([features, labels, signals])) - self.assertAllEqual(a[batch_size:2*batch_size], evaluated_features['a']) - self.assertAllEqual(b[batch_size:2*batch_size], evaluated_labels) - self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping']) - self.assertAllEqual([0.] * batch_size, - evaluated_signals['padding_mask']) - - # This is the final partial batch. - evaluated_features, evaluated_labels, evaluated_signals = ( - sess.run([features, labels, signals])) - real_batch_size = num_samples % batch_size - - # Assert the real part. - self.assertAllEqual(a[2*batch_size:num_samples], - evaluated_features['a'][:real_batch_size]) - self.assertAllEqual(b[2*batch_size:num_samples], - evaluated_labels[:real_batch_size]) - # Assert the padded part. - self.assertAllEqual([0.0] * (batch_size - real_batch_size), - evaluated_features['a'][real_batch_size:]) - self.assertAllEqual([[0.0]] * (batch_size - real_batch_size), - evaluated_labels[real_batch_size:]) - - self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping']) - - padding = ([.0] * real_batch_size - + [1.] * (batch_size - real_batch_size)) - self.assertAllEqual(padding, evaluated_signals['padding_mask']) - - # This run should work, *but* see STOP ('1') as signals - _, evaluated_signals = sess.run([features, signals]) - self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping']) - - with self.assertRaises(errors.OutOfRangeError): - sess.run(features) - - def test_slice(self): - num_samples = 3 - batch_size = 2 - - params = {'batch_size': batch_size} - input_fn, (a, b) = make_input_fn(num_samples=num_samples) - - with ops.Graph().as_default(): - dataset = input_fn(params) - inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size, - add_padding=True) - dataset_initializer = inputs.dataset_initializer() - features, _ = inputs.features_and_labels() - signals = inputs.signals() - - sliced_features = ( - tpu_estimator._PaddingSignals.slice_tensor_or_dict( - features, signals)) - - with session.Session() as sess: - sess.run(dataset_initializer) - - result, evaluated_signals = sess.run([sliced_features, signals]) - self.assertAllEqual(a[:batch_size], result['a']) - self.assertAllEqual(b[:batch_size], result['b']) - self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping']) - - # This is the final partial batch. 
- result, evaluated_signals = sess.run([sliced_features, signals]) - self.assertEqual(1, len(result['a'])) - self.assertAllEqual(a[batch_size:num_samples], result['a']) - self.assertAllEqual(b[batch_size:num_samples], result['b']) - self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping']) - - # This run should work, *but* see STOP ('1') as signals - _, evaluated_signals = sess.run([sliced_features, signals]) - self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping']) - - with self.assertRaises(errors.OutOfRangeError): - sess.run(sliced_features) - - def test_slice_with_multi_invocations_per_step(self): - num_samples = 3 - batch_size = 2 - - params = {'batch_size': batch_size} - input_fn, (a, b) = make_input_fn(num_samples=num_samples) - - with ops.Graph().as_default(): - dataset = input_fn(params) - inputs = tpu_estimator._InputsWithStoppingSignals( - dataset, batch_size, add_padding=True, num_invocations_per_step=2) - dataset_initializer = inputs.dataset_initializer() - features, _ = inputs.features_and_labels() - signals = inputs.signals() - - sliced_features = ( - tpu_estimator._PaddingSignals.slice_tensor_or_dict(features, signals)) - - with session.Session() as sess: - sess.run(dataset_initializer) - - result, evaluated_signals = sess.run([sliced_features, signals]) - self.assertAllEqual(a[:batch_size], result['a']) - self.assertAllEqual(b[:batch_size], result['b']) - self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping']) - - # This is the final partial batch. - result, evaluated_signals = sess.run([sliced_features, signals]) - self.assertEqual(1, len(result['a'])) - self.assertAllEqual(a[batch_size:num_samples], result['a']) - self.assertAllEqual(b[batch_size:num_samples], result['b']) - self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping']) - - # We should see 3 continuous batches with STOP ('1') as signals and all - # of them have mask 1. - _, evaluated_signals = sess.run([sliced_features, signals]) - self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping']) - self.assertAllEqual([1.] * batch_size, - evaluated_signals['padding_mask']) - - _, evaluated_signals = sess.run([sliced_features, signals]) - self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping']) - self.assertAllEqual([1.] * batch_size, - evaluated_signals['padding_mask']) - - _, evaluated_signals = sess.run([sliced_features, signals]) - self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping']) - self.assertAllEqual([1.] 
* batch_size, - evaluated_signals['padding_mask']) - with self.assertRaises(errors.OutOfRangeError): - sess.run(sliced_features) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/python/tpu/tpu_strategy_util.py b/tensorflow/python/tpu/tpu_strategy_util.py index a832819c37d..7e18c3b1d65 100644 --- a/tensorflow/python/tpu/tpu_strategy_util.py +++ b/tensorflow/python/tpu/tpu_strategy_util.py @@ -20,14 +20,11 @@ from __future__ import print_function from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session as session_lib -from tensorflow.python.distribute import device_util from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver from tensorflow.python.eager import context from tensorflow.python.eager import function -from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.tpu import functional as tpu_functional_ops from tensorflow.python.tpu import topology from tensorflow.python.tpu import tpu from tensorflow.python.util import compat @@ -68,30 +65,25 @@ def initialize_tpu_system(cluster_resolver=None): # DistributedTPURewritePass. This pass actually adds real ops that # initialize the TPU system. Thus, we can't simply run tpu.initialize_system # eagerly. We need to wrap it in defun and trigger the rewrite passes on it. - # The easiest way to trigger a rewrite is to run the function with - # TPUPartitionedCallOp. @function.defun def _tpu_init_fn(): return tpu.initialize_system() - # We can't call _tpu_init_fn normally (because it contains just a dummy op, - # see above) but need to define it to get it added to eager context - # and get its assigned name. - # pylint: disable=protected-access - graph_func = _tpu_init_fn._get_concrete_function_internal() - func_name = compat.as_str(graph_func._inference_function.name) - # pylint: enable=protected-access - tpu_devices = sorted( [x for x in context.list_devices() if "device:TPU:" in x]) if not tpu_devices: raise RuntimeError("Could not find any TPU devices") - with ops.device(device_util.get_host_for_device(tpu_devices[0])): - output = tpu_functional_ops.TPUPartitionedCall( - args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name) - serialized_topology = output[0].numpy() + # Replace the remote TPU device with the remote TPU_SYSTEM system device. As + # in the remote TPU device case, we will try to compile it instead of + # running through optimization passes and TF Executor, but TPU_SYSTEM should + # work. + tpu_system_device = tpu_devices[0].replace("TPU", "TPU_SYSTEM") + + with ops.device(tpu_system_device): + output = _tpu_init_fn() + serialized_topology = output.numpy() else: master = cluster_resolver.master() session_config = config_pb2.ConfigProto(allow_soft_placement=True) diff --git a/tensorflow/python/tpu/util.py b/tensorflow/python/tpu/util.py index dfb8ce1d182..6e0da240466 100644 --- a/tensorflow/python/tpu/util.py +++ b/tensorflow/python/tpu/util.py @@ -1,51 +1,23 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
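For reference, the TPU_SYSTEM substitution in the rewritten initialize_tpu_system path above is a plain string replacement on the first TPU device; with a hypothetical device path it yields:

device = "/job:worker/replica:0/task:0/device:TPU:0"  # made-up device string
print(device.replace("TPU", "TPU_SYSTEM"))
# /job:worker/replica:0/task:0/device:TPU_SYSTEM:0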
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# =================================================================== - -"""Utilities for the functionalities.""" +# ============================================================================== +"""Stub file to maintain backwards compatibility.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import time -import six - -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.training import training - -def check_positive_integer(value, name): - """Checks whether `value` is a positive integer.""" - if not isinstance(value, six.integer_types): - raise TypeError('{} must be int, got {}'.format(name, type(value))) - - if value <= 0: - raise ValueError('{} must be positive, got {}'.format(name, value)) - - -# TODO(b/118302029) Remove this copy of MultiHostDatasetInitializerHook after we -# release a tensorflow_estimator with MultiHostDatasetInitializerHook in -# python/estimator/util.py. -class MultiHostDatasetInitializerHook(training.SessionRunHook): - """Creates a SessionRunHook that initializes all passed iterators.""" - - def __init__(self, dataset_initializers): - self._initializers = dataset_initializers - - def after_create_session(self, session, coord): - del coord - start = time.time() - session.run(self._initializers) - logging.info('Initialized dataset iterators in %d seconds', - time.time() - start) +# pylint: disable=wildcard-import,unused-import +from tensorflow_estimator.python.estimator.tpu.util import * +# pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py index 72556db4cb9..70c95f2701d 100644 --- a/tensorflow/python/training/checkpoint_management.py +++ b/tensorflow/python/training/checkpoint_management.py @@ -481,6 +481,7 @@ class CheckpointManager(object): """Deletes old checkpoints. Example usage: + ```python import tensorflow as tf checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) @@ -497,8 +498,12 @@ class CheckpointManager(object): particular directory at a time. """ - def __init__(self, checkpoint, directory, - max_to_keep, keep_checkpoint_every_n_hours=None): + def __init__(self, + checkpoint, + directory, + max_to_keep, + keep_checkpoint_every_n_hours=None, + checkpoint_name="ckpt"): """Configure a `CheckpointManager` for use in `directory`. If a `CheckpointManager` was previously used in `directory`, its @@ -537,6 +542,7 @@ class CheckpointManager(object): checkpoint will be preserved if it has been at least `keep_checkpoint_every_n_hours` since the last preserved checkpoint. The default setting of `None` does not preserve any checkpoints in this way. + checkpoint_name: Custom name for the checkpoint file. Raises: ValueError: If `max_to_keep` is not a positive integer. 
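A short usage sketch of the new checkpoint_name argument documented above, assuming the public tf.train.CheckpointManager export and eager execution; the directory and names are illustrative:

import os
import tensorflow as tf

checkpoint = tf.train.Checkpoint()
manager = tf.train.CheckpointManager(
    checkpoint, directory="/tmp/ckpts", max_to_keep=2,
    checkpoint_name="model")          # new argument added by this change
path = manager.save(checkpoint_number=5)
print(os.path.basename(path))         # model-5 rather than the default ckpt-5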
@@ -551,7 +557,7 @@ class CheckpointManager(object): self._max_to_keep = max_to_keep self._keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours self._directory = directory - self._checkpoint_prefix = os.path.join(directory, "ckpt") + self._checkpoint_prefix = os.path.join(directory, checkpoint_name) recovered_state = get_checkpoint_state(directory) current_clock = time.time() self._maybe_delete = collections.OrderedDict() diff --git a/tensorflow/python/training/checkpoint_management_test.py b/tensorflow/python/training/checkpoint_management_test.py index 053298d1a59..4c40945cb15 100644 --- a/tensorflow/python/training/checkpoint_management_test.py +++ b/tensorflow/python/training/checkpoint_management_test.py @@ -558,6 +558,19 @@ class CheckpointManagerTest(test.TestCase): # Only the most recent two checkpoints are saved self.assertEqual([path, last_path], state.all_model_checkpoint_paths) + @test_util.run_in_graph_and_eager_modes + def testCustomCheckpointPrefix(self): + directory = self.get_temp_dir() + checkpoint = util.Checkpoint() + manager = checkpoint_management.CheckpointManager( + checkpoint, directory, max_to_keep=2, checkpoint_name="ckpt_name") + path = manager.save(checkpoint_number=5) + self.assertEqual(os.path.basename(path), "ckpt_name-5") + manager = checkpoint_management.CheckpointManager( + checkpoint, directory, max_to_keep=2) + path = manager.save(checkpoint_number=5) + self.assertEqual(os.path.basename(path), "ckpt-5") + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/training/experimental/loss_scale_optimizer.py b/tensorflow/python/training/experimental/loss_scale_optimizer.py index b0d101fd6d5..eaa3f022bee 100644 --- a/tensorflow/python/training/experimental/loss_scale_optimizer.py +++ b/tensorflow/python/training/experimental/loss_scale_optimizer.py @@ -119,8 +119,8 @@ class MixedPrecisionLossScaleOptimizer(optimizer.Optimizer): grads = [g for g, _ in grads_and_vars] variables = [v for _, v in grads_and_vars] - scaled_grads = self._scale_grads(grads) - return list(zip(scaled_grads, variables)) + unscaled_grads = self._unscale_grads(grads) + return list(zip(unscaled_grads, variables)) def _scale_loss(self, loss): loss_scale = self._loss_scale() @@ -128,7 +128,7 @@ class MixedPrecisionLossScaleOptimizer(optimizer.Optimizer): return lambda: loss() * loss_scale return loss * loss_scale - def _scale_grads(self, grads): + def _unscale_grads(self, grads): loss_scale = self._loss_scale() loss_scale_reciprical = 1 / loss_scale return [ @@ -171,6 +171,7 @@ class MixedPrecisionLossScaleOptimizer(optimizer.Optimizer): return self._optimizer.apply_gradients(grads_and_vars, global_step, name) replica_context = distribution_strategy_context.get_replica_context() + grads_and_vars = tuple(grads_and_vars) # TODO(nluehr) cleanup GraphKeys.TRAIN_OP return replica_context.merge_call( diff --git a/tensorflow/python/training/tracking/BUILD b/tensorflow/python/training/tracking/BUILD index d9f64fb8335..bf73e25afa4 100644 --- a/tensorflow/python/training/tracking/BUILD +++ b/tensorflow/python/training/tracking/BUILD @@ -12,6 +12,10 @@ licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) load("//tensorflow:tensorflow.bzl", "tf_py_test") +load( + "//tensorflow/tools/test:performance.bzl", + "tf_py_logged_benchmark", +) load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test") py_library( @@ -285,3 +289,18 @@ tf_py_test( "//tensorflow/python:framework_test_lib", ], ) + +tf_py_test( + name = "benchmarks_test", + srcs = ["benchmarks_test.py"], + 
additional_deps = [ + ":util", + "//tensorflow/python:platform_test", + "//tensorflow/python:framework_ops", + ], +) + +tf_py_logged_benchmark( + name = "benchmarks", + target = "//tensorflow/python/training/tracking:benchmarks_test", +) diff --git a/tensorflow/python/training/tracking/base.py b/tensorflow/python/training/tracking/base.py index 6f20c3755df..279d2dbba1b 100644 --- a/tensorflow/python/training/tracking/base.py +++ b/tensorflow/python/training/tracking/base.py @@ -19,9 +19,6 @@ from __future__ import print_function import abc import collections -import functools -import json -import weakref import six @@ -35,7 +32,6 @@ from tensorflow.python.ops import gen_io_ops as io_ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training.saving import saveable_object from tensorflow.python.util import nest -from tensorflow.python.util import serialization from tensorflow.python.util import tf_decorator # Key where the object graph proto is saved in a TensorBundle @@ -883,34 +879,7 @@ class Trackable(object): lambda name="global_name_for_this_object": SaveableObject(name=name, ...)} """ - if not hasattr(self, "get_config"): - return {} - try: - self.get_config() - except NotImplementedError: - return {} - weak_self = weakref.ref(self) - - def _state_callback(): - """Serializes `self.get_config()` for saving.""" - dereferenced_self = weak_self() - if dereferenced_self: - try: - return json.dumps( - dereferenced_self, - default=serialization.get_json_type, - sort_keys=True).encode("utf8") - except TypeError: - # Even if get_config worked objects may have produced garbage. - return "" - else: - return "" - - return { - OBJECT_CONFIG_JSON_KEY: - functools.partial( - PythonStringStateSaveable, state_callback=_state_callback) - } + return {} def _list_functions_for_serialization(self): """Lists the functions of this trackable to serialize. diff --git a/tensorflow/python/training/tracking/benchmarks_test.py b/tensorflow/python/training/tracking/benchmarks_test.py new file mode 100644 index 00000000000..a3cec89cb2d --- /dev/null +++ b/tensorflow/python/training/tracking/benchmarks_test.py @@ -0,0 +1,118 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Benchmarks for checkpoint-related APIs.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.module import module +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.platform import test +from tensorflow.python.training.saving import saveable_object +from tensorflow.python.training.tracking import base +from tensorflow.python.training.tracking import util + + +class _TrivialSaveable(saveable_object.SaveableObject): + + def __init__(self, name): + op = lambda: array_ops.ones([]) + super(_TrivialSaveable, self).__init__( + op=op, + specs=[saveable_object.SaveSpec( + op, "", name, dtype=dtypes.float32, device="CPU:0")], + name=name) + + def restore(self, restored_tensors, restored_shapes): + return control_flow_ops.no_op() + + +class _TrivialRestore(base.Trackable): + + def _gather_saveables_for_checkpoint(self): + return {base.VARIABLE_VALUE_KEY: _TrivialSaveable} + + +class _LazyTrivialObjects(module.Module): + + def __init__(self): + self.existing = [_TrivialRestore() for _ in range(5)] + self.lazy = [] + + def __call__(self): + if not self.lazy: + self.lazy.extend(_TrivialRestore() for _ in range(5)) + return + + +def _save_checkpoint(): + original_checkpoint = util.Checkpoint(m=_LazyTrivialObjects()) + original_checkpoint.m() + return original_checkpoint.write(os.path.join(test.get_temp_dir(), "ckpt")) + + +class SavingBenchmarks(test.Benchmark): + + def _run(self, func, num_iters, execution_mode=None): + func() + start = time.time() + for _ in xrange(num_iters): + func() + end = time.time() + mean_us = (end - start) * 1e6 / num_iters + self.report_benchmark( + iters=num_iters, + wall_time=mean_us, + extras={"examples_per_sec": num_iters / (end - start)}) + + def benchmark_baseline_no_restore(self): + + def _create_and_call(): + checkpoint = util.Checkpoint(m=_LazyTrivialObjects()) + checkpoint.m() + + self._run(_create_and_call, 3) + + def benchmark_batch_restore(self): + checkpoint_path = _save_checkpoint() + + def _create_and_call(): + checkpoint = util.Checkpoint(m=_LazyTrivialObjects()) + checkpoint.m() + checkpoint.restore(checkpoint_path) + + self._run(_create_and_call, 3) + + def benchmark_restore_on_create(self): + checkpoint_path = _save_checkpoint() + + def _create_and_call(): + checkpoint = util.Checkpoint(m=_LazyTrivialObjects()) + checkpoint.restore(checkpoint_path) + checkpoint.m() + + self._run(_create_and_call, 3) + + +if __name__ == "__main__": + ops.enable_eager_execution() + test.main() diff --git a/tensorflow/python/training/tracking/data_structures.py b/tensorflow/python/training/tracking/data_structures.py index 73df6872c27..1695e44bad2 100644 --- a/tensorflow/python/training/tracking/data_structures.py +++ b/tensorflow/python/training/tracking/data_structures.py @@ -665,7 +665,6 @@ class _DictWrapper(TrackableDataStructure, wrapt.ObjectProxy): wrapt.ObjectProxy.__init__(self, wrapped_dict) TrackableDataStructure.__init__(self) self._self_non_string_key = False - self._self_non_append_mutation = False self._self_external_modification = False self.__wrapped__.update( {key: self._track_value( @@ -690,14 +689,12 @@ class _DictWrapper(TrackableDataStructure, wrapt.ObjectProxy): # pylint: disable=protected-access 
def __copy__(self): copied = _DictWrapper(copy.copy(self.__wrapped__)) - copied._self_non_append_mutation = self._self_non_append_mutation copied._self_external_modification = self._self_external_modification copied._self_non_string_key = self._self_non_string_key return copied def __deepcopy__(self, memo): copied = _DictWrapper(copy.deepcopy(self.__wrapped__, memo)) - copied._self_non_append_mutation = self._self_non_append_mutation copied._self_external_modification = self._self_external_modification copied._self_non_string_key = self._self_non_string_key return copied @@ -725,15 +722,6 @@ class _DictWrapper(TrackableDataStructure, wrapt.ObjectProxy): "checkpointed, wrap it in a tf.contrib.checkpoint.NoDependency " "object; it will be automatically un-wrapped and subsequently " "ignored." % (self,)) - if self._self_non_append_mutation: - raise ValueError( - "Unable to save the object %s (a dictionary wrapper constructed " - "automatically on attribute assignment). A key mapping to a " - "trackable object was overwritten or deleted, which would " - "cause problems for restoration.\n\nIf you don't need this " - "dictionary checkpointed, wrap it in a " - "tf.contrib.checkpoint.NoDependency object; it will be automatically " - "un-wrapped and subsequently ignored." % (self,)) if self._self_external_modification: raise ValueError( "Unable to save the object %s (a dictionary wrapper constructed " @@ -752,7 +740,6 @@ class _DictWrapper(TrackableDataStructure, wrapt.ObjectProxy): def _dirty(self): """Check if there has already been a mutation which prevents saving.""" return (self._self_external_modification - or self._self_non_append_mutation or self._self_non_string_key) def _check_self_external_modification(self): @@ -800,39 +787,20 @@ class _DictWrapper(TrackableDataStructure, wrapt.ObjectProxy): self._maybe_initialize_trackable() no_dep = isinstance(value, NoDependency) if isinstance(key, six.string_types): - existing_dependency = self._lookup_dependency(key) value = self._track_value(value, name=key) else: value = _wrap_or_unwrap(value) - existing_dependency = None if not no_dep and isinstance(value, base.Trackable): # Non-string keys are OK as long as we have no reason to add a # dependency on the value (either because the value is not # trackable, or because it was wrapped in a NoDependency object). self._self_non_string_key = True - if key in self.__wrapped__: - previous_value = self.__wrapped__[key] - if previous_value is not value: - if ((not no_dep and isinstance(value, base.Trackable)) - # We don't want to just check that the existing object is - # trackable, since it may have been wrapped in a NoDependency - # object. - or existing_dependency is not None): - # A trackable object was replaced under the same key; this means - # that restoring would be error-prone, so we'll throw an exception on - # save. - self._self_non_append_mutation = True self.__wrapped__[key] = value self._update_snapshot() def __delitem__(self, key): self._check_self_external_modification() - existing_value = self[key] - if isinstance(existing_value, base.Trackable): - # Deleting tracked trackable values means restoring is problematic, - # so we'll throw an exception on save. 
- self._self_non_append_mutation = True del self.__wrapped__[key] self._update_snapshot() diff --git a/tensorflow/python/training/tracking/data_structures_test.py b/tensorflow/python/training/tracking/data_structures_test.py index 2746c40e8da..42d75df460d 100644 --- a/tensorflow/python/training/tracking/data_structures_test.py +++ b/tensorflow/python/training/tracking/data_structures_test.py @@ -663,15 +663,6 @@ class MappingTests(test.TestCase): model.save_weights(save_path) model.load_weights(save_path) - def testDelNoSave(self): - model = training.Model() - model.d = {} - model.d["a"] = [] - del model.d["a"] - save_path = os.path.join(self.get_temp_dir(), "ckpt") - with self.assertRaisesRegexp(ValueError, "overwritten or deleted"): - model.save_weights(save_path) - def testPopNoSave(self): model = training.Model() model.d = {} @@ -690,14 +681,13 @@ class MappingTests(test.TestCase): with self.assertRaisesRegexp(ValueError, "modified outside the wrapper"): model.save_weights(save_path) - def testOverwriteNoSave(self): + def testOverwriteCanStillSave(self): model = training.Model() model.d = {} model.d["a"] = {} model.d["a"] = {} save_path = os.path.join(self.get_temp_dir(), "ckpt") - with self.assertRaisesRegexp(ValueError, "overwritten or deleted"): - model.save_weights(save_path) + model.save_weights(save_path) def testIter(self): model = training.Model() diff --git a/tensorflow/python/training/tracking/util.py b/tensorflow/python/training/tracking/util.py index 19934c46f97..7a4ad538eca 100644 --- a/tensorflow/python/training/tracking/util.py +++ b/tensorflow/python/training/tracking/util.py @@ -201,8 +201,12 @@ class _CheckpointRestoreCoordinator(object): """ restore_ops = [] # Eagerly run restorations for Python state. - reader = pywrap_tensorflow.NewCheckpointReader(self.save_path_string) + reader = None for saveable in python_saveables: + if reader is None: + # Lazily create the NewCheckpointReader, since this requires file access + # and we may not have any Python saveables. + reader = pywrap_tensorflow.NewCheckpointReader(self.save_path_string) spec_names = [spec.name for spec in saveable.specs] saveable.python_restore([reader.get_tensor(name) for name in spec_names]) @@ -612,8 +616,10 @@ def streaming_restore(status, session=None): session = keras_backend.get_session() if isinstance(status, NameBasedSaverStatus): raise NotImplementedError( - "Streaming restore not supported from name-based checkpoints. File a " - "feature request if this limitation bothers you.") + "Streaming restore not supported from name-based checkpoints when " + "graph building. File a feature request if this limitation bothers " + "you. 
As a workaround, consider either using tf.train.Checkpoint to " + "load name-based checkpoints or enabling eager execution.") status.run_restore_ops(session=session) # pylint: disable=protected-access status._checkpoint.new_restore_ops_callback = ( diff --git a/tensorflow/python/training/tracking/util_test.py b/tensorflow/python/training/tracking/util_test.py index c4b72f0162b..a08fe868be6 100644 --- a/tensorflow/python/training/tracking/util_test.py +++ b/tensorflow/python/training/tracking/util_test.py @@ -17,14 +17,12 @@ from __future__ import division from __future__ import print_function import functools -import json import os import weakref from absl.testing import parameterized import six -from tensorflow.python import pywrap_tensorflow from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.eager import def_function @@ -326,12 +324,6 @@ class CheckpointingTests(parameterized.TestCase, test.TestCase): suffix = "/.ATTRIBUTES/VARIABLE_VALUE" expected_checkpoint_names = [ name + suffix for name in expected_checkpoint_names] - expected_checkpoint_names.append( - "optimizer/.ATTRIBUTES/OBJECT_CONFIG_JSON") - # The Dense layers also save get_config() JSON - expected_checkpoint_names.extend( - ["model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON", - "model/_named_dense/.ATTRIBUTES/OBJECT_CONFIG_JSON"]) named_variables = {v.name: v for v in named_variables} six.assertCountEqual(self, expected_checkpoint_names, named_variables.keys()) @@ -1620,48 +1612,6 @@ class CheckpointCompatibilityTests(test.TestCase): self._check_sentinels(root) -class PythonMetadataTests(test.TestCase): - - @test_util.run_in_graph_and_eager_modes - def testSaveLoad(self): - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - dense = core.Dense(1) - checkpoint = trackable_utils.Checkpoint(dense=dense) - dense(constant_op.constant([[1.]])) - checkpoint.restore(None).initialize_or_restore() - save_path = checkpoint.save(checkpoint_prefix) - - def _get_dense_node_from_object_graph(object_graph_proto): - root_node = object_graph_proto.nodes[0] - for child in root_node.children: - if child.local_name == "dense": - break - else: - raise AssertionError( - "Expected a 'dense' dependency of root, didn't find one.") - dense_node = object_graph_proto.nodes[child.node_id] # pylint: disable=undefined-loop-variable - self.assertEqual(1, len(dense_node.attributes)) - reader = pywrap_tensorflow.NewCheckpointReader(save_path) - layer_json = reader.get_tensor(dense_node.attributes[0].checkpoint_key) - return json.loads(layer_json.decode("utf-8")) - - layer_data = _get_dense_node_from_object_graph( - trackable_utils.object_metadata(save_path)) - self.assertEqual("Dense", layer_data["class_name"]) - self.assertEqual(1, layer_data["config"]["units"]) - - # Check that no new ops are added to the graph the second time we save. 
- ops.get_default_graph().finalize() - - dense.units = 42 - save_path = checkpoint.save(checkpoint_prefix) - layer_data = _get_dense_node_from_object_graph( - trackable_utils.object_metadata(save_path)) - self.assertEqual("Dense", layer_data["class_name"]) - self.assertEqual(42, layer_data["config"]["units"]) - - if __name__ == "__main__": ops.enable_eager_execution() test.main() diff --git a/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py b/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py index e00131a8e46..c36790b4fb4 100644 --- a/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py +++ b/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py @@ -125,10 +125,6 @@ class CheckpointingTests(test.TestCase): suffix = "/.ATTRIBUTES/VARIABLE_VALUE" expected_checkpoint_names = [ name + suffix for name in expected_checkpoint_names] - # The Dense layers also save get_config() JSON - expected_checkpoint_names.extend( - ["model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON", - "model/_named_dense/.ATTRIBUTES/OBJECT_CONFIG_JSON"]) named_variables = {v.name: v for v in named_variables} six.assertCountEqual(self, expected_checkpoint_names, named_variables.keys()) diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc index a262e4c8584..a86bb29833d 100644 --- a/tensorflow/python/util/util.cc +++ b/tensorflow/python/util/util.cc @@ -550,7 +550,9 @@ ValueIteratorPtr GetValueIteratorForComposite(PyObject* nested) { if (PyErr_Occurred() || nested == nullptr) { return absl::make_unique<ErrorValueIterator>(); } - return absl::make_unique<SingleValueIterator>(nested); + ValueIteratorPtr result = absl::make_unique<SingleValueIterator>(nested); + Py_DECREF(nested); // ValueIterator took ownership + return result; } return GetValueIterator(nested); } @@ -733,18 +735,22 @@ bool AssertSameStructureHelper( if (check_composite_tensor_metadata && IsCompositeTensor(o1)) { if (!IsCompositeTensor(o2)) return false; static char _to_component_metadata[] = "_component_metadata"; - PyObject* m1 = PyObject_CallMethod(o1, _to_component_metadata, nullptr); + Safe_PyObjectPtr m1( + PyObject_CallMethod(o1, _to_component_metadata, nullptr)); if (PyErr_Occurred() || m1 == nullptr) return false; - PyObject* m2 = PyObject_CallMethod(o2, _to_component_metadata, nullptr); - if (PyErr_Occurred() || m2 == nullptr) return false; - if (PyObject_RichCompareBool(m1, m2, Py_NE)) { + Safe_PyObjectPtr m2( + PyObject_CallMethod(o2, _to_component_metadata, nullptr)); + if (PyErr_Occurred() || m2 == nullptr) { + return false; + } + if (PyObject_RichCompareBool(m1.get(), m2.get(), Py_NE)) { *is_type_error = false; *error_msg = tensorflow::strings::StrCat( "The two CompositeTensors have different metadata. 
" "First CompositeTensor ", - PyObjectToString(o1), " has metadata ", PyObjectToString(m1), + PyObjectToString(o1), " has metadata ", PyObjectToString(m1.get()), ", while second structure ", PyObjectToString(o2), " has metadata ", - PyObjectToString(m2)); + PyObjectToString(m2.get())); return false; } } diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD index b12a661cd3c..84d07806bde 100644 --- a/tensorflow/stream_executor/BUILD +++ b/tensorflow/stream_executor/BUILD @@ -666,6 +666,27 @@ cc_library( ], ) +cc_library( + name = "device_memory_allocator", + srcs = [ + "device_memory_allocator.cc", + "owning_device_memory.cc", + ], + hdrs = [ + "device_memory_allocator.h", + "owning_device_memory.h", + ], + deps = [ + ":platform", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + ], +) + tf_cc_test( name = "stream_test", size = "small", diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD index fa456bf76d3..3be9bd59495 100644 --- a/tensorflow/stream_executor/cuda/BUILD +++ b/tensorflow/stream_executor/cuda/BUILD @@ -13,7 +13,10 @@ load( "tf_additional_cupti_stub_data", ) load("//tensorflow:tensorflow.bzl", "tf_copts") -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") +load( + "//tensorflow/core:platform/default/cuda_build_defs.bzl", + "if_cuda_is_configured", +) load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static") package_group( @@ -198,12 +201,22 @@ cc_library( ]), ) +alias( + name = "cublas_lib", + actual = if_static( + "@local_config_cuda//cuda:cublas", + ":cublas_stub", + ), + visibility = ["//visibility:private"], +) + cc_library( name = "cublas_plugin", srcs = if_cuda_is_configured(["cuda_blas.cc"]), hdrs = if_cuda_is_configured(["cuda_blas.h"]), visibility = ["//visibility:public"], deps = if_cuda_is_configured([ + ":cublas_lib", ":cuda_activation", ":cuda_gpu_executor", ":cuda_platform_id", @@ -223,10 +236,7 @@ cc_library( "//tensorflow/stream_executor/gpu:gpu_helpers_header", "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform", - ] + if_static( - ["@local_config_cuda//cuda:cublas"], - [":cublas_stub"], - )) + [ + ]) + [ "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/synchronization", ], @@ -244,6 +254,15 @@ cc_library( ]), ) +alias( + name = "cufft_lib", + actual = if_static( + "@local_config_cuda//cuda:cufft", + ":cufft_stub", + ), + visibility = ["//visibility:private"], +) + cc_library( name = "cufft_plugin", srcs = if_cuda_is_configured(["cuda_fft.cc"]), @@ -255,6 +274,7 @@ cc_library( ":cuda_platform_id", ":cuda_stream", ":cuda_helpers", + ":cufft_lib", "@local_config_cuda//cuda:cuda_headers", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:fft", @@ -264,10 +284,7 @@ cc_library( "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform", "//tensorflow/stream_executor/platform:dso_loader", - ] + if_static( - ["@local_config_cuda//cuda:cufft"], - [":cufft_stub"], - )), + ]), alwayslink = True, ) @@ -282,6 +299,15 @@ cc_library( ]), ) +alias( + name = "cudnn_lib", + actual = if_static( + "@local_config_cuda//cuda:cudnn", + ":cudnn_stub", + ), + visibility = ["//visibility:private"], +) + cc_library( name = "cudnn_plugin", srcs = if_cuda_is_configured(["cuda_dnn.cc"]), @@ -296,6 +322,7 @@ cc_library( 
":cuda_stream", ":cuda_timer", ":cudnn_version", + ":cudnn_lib", "@com_google_absl//absl/strings", "//third_party/eigen3", "@local_config_cuda//cuda:cuda_headers", @@ -309,10 +336,7 @@ cc_library( "//tensorflow/stream_executor:temporary_device_memory", "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform", - ]) + tf_additional_cudnn_plugin_deps() + if_cuda_is_configured(if_static( - ["@local_config_cuda//cuda:cudnn"], - [":cudnn_stub"], - )) + ["@com_google_absl//absl/synchronization"], + ]) + tf_additional_cudnn_plugin_deps() + ["@com_google_absl//absl/synchronization"], alwayslink = True, ) @@ -327,6 +351,15 @@ cc_library( ]), ) +alias( + name = "curand_lib", + actual = if_static( + "@local_config_cuda//cuda:curand", + ":curand_stub", + ), + visibility = ["//visibility:private"], +) + cc_library( name = "curand_plugin", srcs = if_cuda_is_configured(["cuda_rng.cc"]), @@ -337,6 +370,7 @@ cc_library( ":cuda_platform_id", ":cuda_stream", ":cuda_helpers", + ":curand_lib", "@local_config_cuda//cuda:cuda_headers", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:plugin_registry", @@ -346,10 +380,7 @@ cc_library( "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform", "//tensorflow/stream_executor/platform:dso_loader", - ] + if_static( - ["@local_config_cuda//cuda:curand"], - [":curand_stub"], - )), + ]), alwayslink = True, ) diff --git a/tensorflow/stream_executor/cuda/cublas_stub.cc b/tensorflow/stream_executor/cuda/cublas_stub.cc index beca8c1a46f..b8e203fe235 100644 --- a/tensorflow/stream_executor/cuda/cublas_stub.cc +++ b/tensorflow/stream_executor/cuda/cublas_stub.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "cuda/include/cublas.h" -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cublas.h" +#include "third_party/gpus/cuda/include/cuda.h" #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index 661d84512a2..421b9b4ce42 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "cuda/include/cublas_v2.h" -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cublas_v2.h" +#include "third_party/gpus/cuda/include/cuda.h" #define SE_CUDA_DATA_HALF CUDA_R_16F @@ -40,7 +40,7 @@ limitations under the License. // TODO(b/73793421): Remove the following code block to switch to the second // approach when the issue is fixed. #if CUDA_VERSION < 9000 -#include "cuda/include/cuda_fp16.h" +#include "third_party/gpus/cuda/include/cuda_fp16.h" #define EIGEN_HAS_CUDA_FP16 #endif diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 12da285bb6a..8fc0549b01b 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -43,7 +43,7 @@ limitations under the License. 
#include "tensorflow/stream_executor/stream.h" #include "tensorflow/stream_executor/stream_executor_pimpl.h" // clang-format off -#include "cuda/include/cudnn.h" +#include "third_party/gpus/cudnn/cudnn.h" #include "absl/strings/string_view.h" // clang-format on diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc index f41f64f6aaa..91b5014339c 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.cc +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -31,7 +31,7 @@ limitations under the License. #include "absl/strings/str_format.h" #include "absl/synchronization/mutex.h" #include "absl/synchronization/notification.h" -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/lib/error.h" diff --git a/tensorflow/stream_executor/cuda/cuda_fft.h b/tensorflow/stream_executor/cuda/cuda_fft.h index 0f3baeab6fa..e7b0d66cb13 100644 --- a/tensorflow/stream_executor/cuda/cuda_fft.h +++ b/tensorflow/stream_executor/cuda/cuda_fft.h @@ -20,7 +20,7 @@ limitations under the License. #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_ #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_ -#include "cuda/include/cufft.h" +#include "third_party/gpus/cuda/include/cufft.h" #include "tensorflow/stream_executor/fft.h" #include "tensorflow/stream_executor/platform/port.h" #include "tensorflow/stream_executor/plugin_registry.h" diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc index b86c3122fb1..23f22bae9a1 100644 --- a/tensorflow/stream_executor/cuda/cuda_rng.cc +++ b/tensorflow/stream_executor/cuda/cuda_rng.cc @@ -27,7 +27,7 @@ limitations under the License. #include "tensorflow/stream_executor/platform/logging.h" #include "tensorflow/stream_executor/rng.h" // clang-format off -#include "cuda/include/curand.h" +#include "third_party/gpus/cuda/include/curand.h" // clang-format on // Formats curandStatus_t to output prettified values into a log stream. 
diff --git a/tensorflow/stream_executor/cuda/cuda_runtime_10_0.inc b/tensorflow/stream_executor/cuda/cuda_runtime_10_0.inc index b9523ec2690..9b912330512 100644 --- a/tensorflow/stream_executor/cuda/cuda_runtime_10_0.inc +++ b/tensorflow/stream_executor/cuda/cuda_runtime_10_0.inc @@ -383,22 +383,6 @@ cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, return func_ptr(stream, devPtr, length, flags); } -extern __host__ cudaError_t CUDARTAPI -cudaStreamBeginCapture(cudaStream_t stream) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamBeginCapture"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(stream); -} - -extern __host__ cudaError_t CUDARTAPI -cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraph_t *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamEndCapture"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(stream, pGraph); -} - extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing( cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus) { using FuncPtr = @@ -1524,306 +1508,6 @@ cudaRuntimeGetVersion(int *runtimeVersion) { return func_ptr(runtimeVersion); } -extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph, - unsigned int flags) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, unsigned int); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphCreate"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(pGraph, flags); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphAddKernelNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, - cudaGraphNode_t *pDependencies, size_t numDependencies, - const struct cudaKernelNodeParams *pNodeParams) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, - cudaGraphNode_t *, size_t, - const struct cudaKernelNodeParams *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddKernelNode"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(pGraphNode, graph, pDependencies, numDependencies, - pNodeParams); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams( - cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams) { - using FuncPtr = - cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaKernelNodeParams *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeGetParams"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(node, pNodeParams); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams( - cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, - const struct cudaKernelNodeParams *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphKernelNodeSetParams"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(node, pNodeParams); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphAddMemcpyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, - cudaGraphNode_t *pDependencies, size_t numDependencies, - const struct cudaMemcpy3DParms *pCopyParams) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, - cudaGraphNode_t *, size_t, - const struct cudaMemcpy3DParms *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemcpyNode"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(pGraphNode, graph, pDependencies, 
numDependencies, - pCopyParams); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams( - cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams) { - using FuncPtr = - cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemcpy3DParms *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeGetParams"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(node, pNodeParams); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams( - cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, - const struct cudaMemcpy3DParms *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemcpyNodeSetParams"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(node, pNodeParams); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphAddMemsetNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, - cudaGraphNode_t *pDependencies, size_t numDependencies, - const struct cudaMemsetParams *pMemsetParams) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, - cudaGraphNode_t *, size_t, - const struct cudaMemsetParams *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddMemsetNode"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(pGraphNode, graph, pDependencies, numDependencies, - pMemsetParams); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams( - cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams) { - using FuncPtr = - cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemsetParams *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeGetParams"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(node, pNodeParams); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams( - cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, - const struct cudaMemsetParams *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphMemsetNodeSetParams"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(node, pNodeParams); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphAddHostNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, - cudaGraphNode_t *pDependencies, size_t numDependencies, - const struct cudaHostNodeParams *pNodeParams) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, - cudaGraphNode_t *, size_t, - const struct cudaHostNodeParams *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddHostNode"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(pGraphNode, graph, pDependencies, numDependencies, - pNodeParams); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams( - cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams) { - using FuncPtr = - cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaHostNodeParams *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeGetParams"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(node, pNodeParams); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams( - cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, - const struct cudaHostNodeParams *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphHostNodeSetParams"); - if (!func_ptr) return GetSymbolNotFoundError(); - return 
func_ptr(node, pNodeParams); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, - cudaGraphNode_t *pDependencies, - size_t numDependencies, cudaGraph_t childGraph) { - using FuncPtr = cudaError_t(CUDARTAPI *)( - cudaGraphNode_t *, cudaGraph_t, cudaGraphNode_t *, size_t, cudaGraph_t); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddChildGraphNode"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(pGraphNode, graph, pDependencies, numDependencies, - childGraph); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraph_t *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphChildGraphNodeGetGraph"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(node, pGraph); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphAddEmptyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, - cudaGraphNode_t *pDependencies, size_t numDependencies) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, - cudaGraphNode_t *, size_t); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddEmptyNode"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(pGraphNode, graph, pDependencies, numDependencies); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, cudaGraph_t); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphClone"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(pGraphClone, originalGraph); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode, - cudaGraph_t clonedGraph) { - using FuncPtr = - cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraphNode_t, cudaGraph_t); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeFindInClone"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(pNode, originalNode, clonedGraph); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType) { - using FuncPtr = - cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaGraphNodeType *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetType"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(node, pType); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph, - cudaGraphNode_t *nodes, - size_t *numNodes) { - using FuncPtr = - cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetNodes"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(graph, nodes, numNodes); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes( - cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes) { - using FuncPtr = - cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetRootNodes"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(graph, pRootNodes, pNumRootNodes); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph, - cudaGraphNode_t *from, - cudaGraphNode_t *to, - size_t *numEdges) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, - 
cudaGraphNode_t *, size_t *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphGetEdges"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(graph, from, to, numEdges); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies( - cudaGraphNode_t node, cudaGraphNode_t *pDependencies, - size_t *pNumDependencies) { - using FuncPtr = - cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependencies"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(node, pDependencies, pNumDependencies); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes( - cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes, - size_t *pNumDependentNodes) { - using FuncPtr = - cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphNodeGetDependentNodes"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(node, pDependentNodes, pNumDependentNodes); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphAddDependencies(cudaGraph_t graph, cudaGraphNode_t *from, - cudaGraphNode_t *to, size_t numDependencies) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, - cudaGraphNode_t *, size_t); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphAddDependencies"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(graph, from, to, numDependencies); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphRemoveDependencies(cudaGraph_t graph, cudaGraphNode_t *from, - cudaGraphNode_t *to, size_t numDependencies) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, - cudaGraphNode_t *, size_t); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphRemoveDependencies"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(graph, from, to, numDependencies); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphDestroyNode(cudaGraphNode_t node) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroyNode"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(node); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate( - cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode, - char *pLogBuffer, size_t bufferSize) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t *, cudaGraph_t, - cudaGraphNode_t *, char *, size_t); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphInstantiate"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, - cudaStream_t stream) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaStream_t); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphLaunch"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(graphExec, stream); -} - -extern __host__ cudaError_t CUDARTAPI -cudaGraphExecDestroy(cudaGraphExec_t graphExec) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t); - static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphExecDestroy"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(graphExec); -} - -extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph) { - using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t); - 
static auto func_ptr = LoadSymbol<FuncPtr>("cudaGraphDestroy"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(graph); -} - extern __host__ cudaError_t CUDARTAPI cudaGetExportTable( const void **ppExportTable, const cudaUUID_t *pExportTableId) { using FuncPtr = cudaError_t(CUDARTAPI *)(const void **, const cudaUUID_t *); diff --git a/tensorflow/stream_executor/cuda/cuda_stub.cc b/tensorflow/stream_executor/cuda/cuda_stub.cc index 1219e7d98bc..3248c9ddefd 100644 --- a/tensorflow/stream_executor/cuda/cuda_stub.cc +++ b/tensorflow/stream_executor/cuda/cuda_stub.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda.h" #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" diff --git a/tensorflow/stream_executor/cuda/cudart_stub.cc b/tensorflow/stream_executor/cuda/cudart_stub.cc index e14fb70c05f..acdf34e373f 100644 --- a/tensorflow/stream_executor/cuda/cudart_stub.cc +++ b/tensorflow/stream_executor/cuda/cudart_stub.cc @@ -16,7 +16,7 @@ limitations under the License. // This file wraps cuda runtime calls with dso loader so that we don't need to // have explicit linking to libcuda. -#include "cuda/include/cuda_runtime_api.h" +#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" @@ -47,9 +47,9 @@ cudaError_t GetSymbolNotFoundError() { #define __CUDA_DEPRECATED // A bunch of new symbols were introduced in version 10 -#if CUDA_VERSION <= 9020 +#if CUDART_VERSION <= 9020 #include "tensorflow/stream_executor/cuda/cuda_runtime_9_0.inc" -#elif CUDA_VERSION < 10010 +#elif CUDART_VERSION < 10010 #include "tensorflow/stream_executor/cuda/cuda_runtime_10_0.inc" #else #include "tensorflow/stream_executor/cuda/cuda_runtime_10_1.inc" @@ -121,10 +121,10 @@ extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration( return func_ptr(gridDim, blockDim, sharedMem, stream); } -#if CUDA_VERSION >= 10010 -extern void CUDARTAPI __cudaUnregisterFatBinaryEnd(void **fatCubinHandle) { +#if CUDART_VERSION >= 10010 +extern void CUDARTAPI __cudaRegisterFatBinaryEnd(void **fatCubinHandle) { using FuncPtr = void(CUDARTAPI *)(void **fatCubinHandle); - static auto func_ptr = LoadSymbol<FuncPtr>("__cudaUnregisterFatBinaryEnd"); + static auto func_ptr = LoadSymbol<FuncPtr>("__cudaRegisterFatBinaryEnd"); if (!func_ptr) return; func_ptr(fatCubinHandle); } diff --git a/tensorflow/stream_executor/cuda/cudnn_stub.cc b/tensorflow/stream_executor/cuda/cudnn_stub.cc index 2727c215e8c..3b567c15c6c 100644 --- a/tensorflow/stream_executor/cuda/cudnn_stub.cc +++ b/tensorflow/stream_executor/cuda/cudnn_stub.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "cuda/include/cudnn.h" +#include "third_party/gpus/cudnn/cudnn.h" #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" diff --git a/tensorflow/stream_executor/cuda/cufft_stub.cc b/tensorflow/stream_executor/cuda/cufft_stub.cc index c15d98730eb..68d7ec7634d 100644 --- a/tensorflow/stream_executor/cuda/cufft_stub.cc +++ b/tensorflow/stream_executor/cuda/cufft_stub.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "cuda/include/cufft.h" +#include "third_party/gpus/cuda/include/cufft.h" #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" diff --git a/tensorflow/stream_executor/cuda/cupti_stub.cc b/tensorflow/stream_executor/cuda/cupti_stub.cc index ef883f9bf98..0c7dd2e75f0 100644 --- a/tensorflow/stream_executor/cuda/cupti_stub.cc +++ b/tensorflow/stream_executor/cuda/cupti_stub.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "cuda/extras/CUPTI/include/cupti.h" +#include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h" // IWYU pragma: no_include "perftools/gputools/executor/stream_executor.h" #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" diff --git a/tensorflow/stream_executor/cuda/curand_stub.cc b/tensorflow/stream_executor/cuda/curand_stub.cc index dd70384d643..96eeee0fc9d 100644 --- a/tensorflow/stream_executor/cuda/curand_stub.cc +++ b/tensorflow/stream_executor/cuda/curand_stub.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "cuda/include/curand.h" +#include "third_party/gpus/cuda/include/curand.h" #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" diff --git a/tensorflow/stream_executor/cuda/cusolver_stub.cc b/tensorflow/stream_executor/cuda/cusolver_stub.cc index 664d10dcc64..f8d3df98e7e 100644 --- a/tensorflow/stream_executor/cuda/cusolver_stub.cc +++ b/tensorflow/stream_executor/cuda/cusolver_stub.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "cuda/include/cusolverDn.h" +#include "third_party/gpus/cuda/include/cusolverDn.h" #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" diff --git a/tensorflow/stream_executor/cuda/cusparse_stub.cc b/tensorflow/stream_executor/cuda/cusparse_stub.cc index 20ea7a7a85c..439de5eb83a 100644 --- a/tensorflow/stream_executor/cuda/cusparse_stub.cc +++ b/tensorflow/stream_executor/cuda/cusparse_stub.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "cuda/include/cusparse.h" +#include "third_party/gpus/cuda/include/cusparse.h" #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/stream_executor/device_memory_allocator.cc similarity index 57% rename from tensorflow/compiler/xla/service/device_memory_allocator.cc rename to tensorflow/stream_executor/device_memory_allocator.cc index e1e3b156fb3..e925b7be2ee 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.cc +++ b/tensorflow/stream_executor/device_memory_allocator.cc @@ -13,30 +13,31 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" #include <string> -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "tensorflow/core/lib/strings/numbers.h" -namespace xla { +namespace stream_executor { StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator( - const se::Platform* platform, - absl::Span<se::StreamExecutor* const> stream_executors) + const Platform* platform, + absl::Span<StreamExecutor* const> stream_executors) : DeviceMemoryAllocator(platform), stream_executors_(stream_executors.begin(), stream_executors.end()) {} -StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate( +port::StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate( int device_ordinal, uint64 size, bool retry_on_failure) { - TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor, - GetStreamExecutor(device_ordinal)); - se::DeviceMemoryBase result = stream_executor->AllocateArray<uint8>(size); + port::StatusOr<StreamExecutor*> stream_executor_or = + GetStreamExecutor(device_ordinal); + TF_RETURN_IF_ERROR(stream_executor_or.status()); + DeviceMemoryBase result = + stream_executor_or.ValueOrDie()->AllocateArray<uint8>(size); if (size > 0 && result == nullptr) { - return ResourceExhausted( + return tensorflow::errors::ResourceExhausted( "Failed to allocate request for %s (%uB) on device ordinal %d", tensorflow::strings::HumanReadableNumBytes(size), size, device_ordinal); } @@ -47,32 +48,34 @@ StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate( return OwningDeviceMemory(result, device_ordinal, this); } -Status StreamExecutorMemoryAllocator::Deallocate(int device_ordinal, - se::DeviceMemoryBase mem) { +port::Status StreamExecutorMemoryAllocator::Deallocate(int device_ordinal, + DeviceMemoryBase mem) { if (!mem.is_null()) { - TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor, - GetStreamExecutor(device_ordinal)); + port::StatusOr<StreamExecutor*> stream_executor_or = + GetStreamExecutor(device_ordinal); + TF_RETURN_IF_ERROR(stream_executor_or.status()); VLOG(3) << absl::StreamFormat("Freeing %p on device ordinal %d", mem.opaque(), device_ordinal); - stream_executor->Deallocate(&mem); + stream_executor_or.ValueOrDie()->Deallocate(&mem); } - return Status::OK(); + return port::Status::OK(); } 
-StatusOr<se::StreamExecutor*> StreamExecutorMemoryAllocator::GetStreamExecutor( - int device_ordinal) { +port::StatusOr<StreamExecutor*> +StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) { if (device_ordinal < 0) { - return InvalidArgument("device ordinal value (%d) must be non-negative", - device_ordinal); + return tensorflow::errors::InvalidArgument( + "device ordinal value (%d) must be non-negative", device_ordinal); } if (device_ordinal >= stream_executors_.size()) { - return InvalidArgument( + return tensorflow::errors::InvalidArgument( "device ordinal value (%d) >= number of devices (%u)", device_ordinal, stream_executors_.size()); } if (stream_executors_[device_ordinal] == nullptr) { - return NotFound("Device %s:%d present but not supported", - platform()->Name(), device_ordinal); + return tensorflow::errors::NotFound( + absl::StrFormat("Device %s:%d present but not supported", + platform()->Name(), device_ordinal)); } return stream_executors_[device_ordinal]; } @@ -81,4 +84,4 @@ bool StreamExecutorMemoryAllocator::AllowsAsynchronousDeallocation() const { return false; } -} // namespace xla +} // namespace stream_executor diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/stream_executor/device_memory_allocator.h similarity index 74% rename from tensorflow/compiler/xla/service/device_memory_allocator.h rename to tensorflow/stream_executor/device_memory_allocator.h index a2308ee7a41..0d911e25e3a 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.h +++ b/tensorflow/stream_executor/device_memory_allocator.h @@ -19,13 +19,13 @@ limitations under the License. #include <vector> #include "absl/types/span.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/owning_device_memory.h" +#include "tensorflow/stream_executor/platform.h" -namespace xla { +namespace stream_executor { // Interface for device memory allocators used within the XLA service. An // allocator is responsible for allocating memory on all devices of a particular @@ -34,7 +34,7 @@ class DeviceMemoryAllocator { public: // Parameter platform indicates which platform the allocator allocates memory // on. Must be non-null. - explicit DeviceMemoryAllocator(const se::Platform* platform) + explicit DeviceMemoryAllocator(const Platform* platform) : platform_(platform) {} virtual ~DeviceMemoryAllocator() {} @@ -47,23 +47,23 @@ class DeviceMemoryAllocator { // fails, the allocation should return immediately without retrying. An // example use case is optional scratch spaces where a failure has only // performance impact. - virtual StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size, - bool retry_on_failure) = 0; + virtual port::StatusOr<OwningDeviceMemory> Allocate( + int device_ordinal, uint64 size, bool retry_on_failure) = 0; // Two-arg version of Allocate(), which sets retry-on-failure to true. // // (We don't simply use a default argument on the virtual Allocate function // because default args on virtual functions are disallowed by the Google // style guide.) 
- StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size) { + port::StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size) { return Allocate(device_ordinal, size, /*retry_on_failure=*/true); } // Must be a nop for null pointers. - virtual Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) = 0; + virtual port::Status Deallocate(int device_ordinal, DeviceMemoryBase mem) = 0; // Return the platform that the allocator allocates memory on. - const se::Platform* platform() const { return platform_; } + const Platform* platform() const { return platform_; } // Can we call Deallocate() as soon as a computation has been scheduled on // a stream, or do we have to wait for the computation to complete first? @@ -71,7 +71,7 @@ class DeviceMemoryAllocator { protected: friend class OwningDeviceMemory; - const se::Platform* platform_; + const Platform* platform_; }; // Default memory allocator for a platform which uses @@ -79,28 +79,28 @@ class DeviceMemoryAllocator { class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator { public: StreamExecutorMemoryAllocator( - const se::Platform* platform, - absl::Span<se::StreamExecutor* const> stream_executors); + const Platform* platform, + absl::Span<StreamExecutor* const> stream_executors); - StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size, - bool retry_on_failure) override; + port::StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size, + bool retry_on_failure) override; // Pull in two-arg overload that sets retry_on_failure to true. using DeviceMemoryAllocator::Allocate; - Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override; + port::Status Deallocate(int device_ordinal, DeviceMemoryBase mem) override; bool AllowsAsynchronousDeallocation() const override; private: - StatusOr<se::StreamExecutor*> GetStreamExecutor(int device_ordinal); + port::StatusOr<StreamExecutor*> GetStreamExecutor(int device_ordinal); // A vector indexed by device ordinal of StreamExecutors for each device of // the allocator's platform type. If an element is nullptr, then the device // with the respective device ordinal is not supported by XLA. - std::vector<se::StreamExecutor*> stream_executors_; + std::vector<StreamExecutor*> stream_executors_; }; -} // namespace xla +} // namespace stream_executor #endif // TENSORFLOW_COMPILER_XLA_SERVICE_DEVICE_MEMORY_ALLOCATOR_H_ diff --git a/tensorflow/stream_executor/dnn.proto b/tensorflow/stream_executor/dnn.proto index 188137b4a40..fb6bda9d317 100644 --- a/tensorflow/stream_executor/dnn.proto +++ b/tensorflow/stream_executor/dnn.proto @@ -71,6 +71,7 @@ enum ConvolutionKind { FORWARD = 1; BACKWARD_FILTER = 2; BACKWARD_DATA = 3; + FORWARD_BIAS_ACTIVATION = 4; } // Generic tensor representation. 
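Since DeviceMemoryAllocator and OwningDeviceMemory move out of the xla namespace in this change, a short hypothetical sketch of a caller written against the relocated stream_executor API may help. The helper name, device ordinal, and allocation size are illustrative; only the constructor, Allocate, Deallocate, and Forget signatures shown in this change's hunks are relied on.

  #include <utility>

  #include "absl/types/span.h"
  #include "tensorflow/stream_executor/device_memory_allocator.h"

  namespace se = stream_executor;

  // Allocates a small scratch buffer on device 0 and hands it back explicitly.
  se::port::Status AllocateAndRelease(
      const se::Platform* platform,
      absl::Span<se::StreamExecutor* const> executors) {
    se::StreamExecutorMemoryAllocator allocator(platform, executors);
    // The two-argument overload defaults retry_on_failure to true.
    se::port::StatusOr<se::OwningDeviceMemory> memory_or =
        allocator.Allocate(/*device_ordinal=*/0, /*size=*/1024);
    if (!memory_or.ok()) return memory_or.status();
    se::OwningDeviceMemory memory = std::move(memory_or.ValueOrDie());
    // Forget() releases ownership so the buffer can be returned explicitly,
    // without relying on destructor behavior not shown in these hunks.
    return allocator.Deallocate(/*device_ordinal=*/0, memory.Forget());
  }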
diff --git a/tensorflow/stream_executor/gpu/BUILD b/tensorflow/stream_executor/gpu/BUILD index 2dd21f49ff1..1bbee8f52a7 100644 --- a/tensorflow/stream_executor/gpu/BUILD +++ b/tensorflow/stream_executor/gpu/BUILD @@ -7,11 +7,17 @@ load( "//tensorflow/stream_executor:build_defs.bzl", "if_gpu_is_configured", ) -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") +load( + "//tensorflow/core:platform/default/cuda_build_defs.bzl", + "if_cuda_is_configured", +) load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured") package( - default_visibility = ["//tensorflow/stream_executor:__subpackages__"], + default_visibility = [ + "//tensorflow/compiler/xla/service/gpu:__subpackages__", + "//tensorflow/stream_executor:__subpackages__", + ], ) # Filegroup used to collect source files for the dependency check. diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h index 73cc24f86fc..07b35192f03 100644 --- a/tensorflow/stream_executor/gpu/gpu_driver.h +++ b/tensorflow/stream_executor/gpu/gpu_driver.h @@ -21,7 +21,7 @@ limitations under the License. #include <stddef.h> #include "tensorflow/stream_executor/platform/port.h" -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuda.h" #include "tensorflow/stream_executor/device_options.h" #include "tensorflow/stream_executor/lib/status.h" #include "tensorflow/stream_executor/lib/statusor.h" diff --git a/tensorflow/stream_executor/gpu/gpu_types.h b/tensorflow/stream_executor/gpu/gpu_types.h index 64a6e5e5efc..c48a4228b7a 100644 --- a/tensorflow/stream_executor/gpu/gpu_types.h +++ b/tensorflow/stream_executor/gpu/gpu_types.h @@ -28,8 +28,8 @@ limitations under the License. #else // CUDA -#include "cuda/include/cuComplex.h" -#include "cuda/include/cuda.h" +#include "third_party/gpus/cuda/include/cuComplex.h" +#include "third_party/gpus/cuda/include/cuda.h" // cannot include curand.h here // because it triggers the #error in cuda/cuda_gpu_executor.cc diff --git a/tensorflow/stream_executor/host/BUILD b/tensorflow/stream_executor/host/BUILD index 00fabe5772f..6ad06bb9bb9 100644 --- a/tensorflow/stream_executor/host/BUILD +++ b/tensorflow/stream_executor/host/BUILD @@ -67,6 +67,7 @@ cc_library( "host_stream.h", ], deps = [ + "//tensorflow/core:lib_internal", "//tensorflow/stream_executor:kernel", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/synchronization", diff --git a/tensorflow/stream_executor/host/host_stream.cc b/tensorflow/stream_executor/host/host_stream.cc index 0d8cb46f196..413edc6739a 100644 --- a/tensorflow/stream_executor/host/host_stream.cc +++ b/tensorflow/stream_executor/host/host_stream.cc @@ -18,6 +18,8 @@ limitations under the License. #include "tensorflow/stream_executor/host/host_stream.h" #include "absl/synchronization/notification.h" +#include "tensorflow/core/platform/denormal.h" +#include "tensorflow/core/platform/setround.h" namespace stream_executor { namespace host { @@ -45,6 +47,11 @@ bool HostStream::EnqueueTask(std::function<void()> fn) { bool HostStream::WorkAvailable() { return !work_queue_.empty(); } void HostStream::WorkLoop() { + // Set denormal and rounding behavior to match the default TF ThreadPool + // behavior. + // TODO(phawkins, jlebar): it's not clear this is the best place to set this. 
+ tensorflow::port::ScopedFlushDenormal flush; + tensorflow::port::ScopedSetRound round(FE_TONEAREST); while (true) { std::function<void()> fn; { diff --git a/tensorflow/compiler/xla/service/owning_device_memory.cc b/tensorflow/stream_executor/owning_device_memory.cc similarity index 82% rename from tensorflow/compiler/xla/service/owning_device_memory.cc rename to tensorflow/stream_executor/owning_device_memory.cc index c115bc097f3..8b92ccfef10 100644 --- a/tensorflow/compiler/xla/service/owning_device_memory.cc +++ b/tensorflow/stream_executor/owning_device_memory.cc @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/owning_device_memory.h" +#include "tensorflow/stream_executor/owning_device_memory.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" -namespace xla { +namespace stream_executor { void OwningDeviceMemory::Free() { CHECK(allocator_ != nullptr) @@ -29,7 +29,7 @@ void OwningDeviceMemory::Free() { } allocator_ = nullptr; - mem_ = se::DeviceMemoryBase(); + mem_ = DeviceMemoryBase(); } -} // namespace xla +} // namespace stream_executor diff --git a/tensorflow/compiler/xla/service/owning_device_memory.h b/tensorflow/stream_executor/owning_device_memory.h similarity index 88% rename from tensorflow/compiler/xla/service/owning_device_memory.h rename to tensorflow/stream_executor/owning_device_memory.h index 4be9bd80477..46946c4acf6 100644 --- a/tensorflow/compiler/xla/service/owning_device_memory.h +++ b/tensorflow/stream_executor/owning_device_memory.h @@ -16,12 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_ -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace xla { +namespace stream_executor { // Break circular dependency between this file and device_memory_allocator.h. class DeviceMemoryAllocator; @@ -43,7 +41,7 @@ class OwningDeviceMemory { public: OwningDeviceMemory() : device_ordinal_(-1), allocator_(nullptr) {} - explicit OwningDeviceMemory(se::DeviceMemoryBase mem, int device_ordinal, + explicit OwningDeviceMemory(DeviceMemoryBase mem, int device_ordinal, DeviceMemoryAllocator* allocator) : mem_(mem), device_ordinal_(device_ordinal), allocator_(allocator) { CHECK(allocator != nullptr) << "allocator cannot be null."; @@ -53,7 +51,7 @@ class OwningDeviceMemory { : mem_(other.mem_), device_ordinal_(other.device_ordinal_), allocator_(other.allocator_) { - other.mem_ = se::DeviceMemoryBase(); + other.mem_ = DeviceMemoryBase(); other.allocator_ = nullptr; } @@ -65,7 +63,7 @@ class OwningDeviceMemory { device_ordinal_ = other.device_ordinal_; allocator_ = other.allocator_; - other.mem_ = se::DeviceMemoryBase(); + other.mem_ = DeviceMemoryBase(); other.allocator_ = nullptr; return *this; } @@ -100,25 +98,25 @@ class OwningDeviceMemory { // !is_null() is sufficient but not necessary to imply `this` is active. 
bool is_null() const { return mem_.is_null(); } - se::DeviceMemoryBase AsDeviceMemoryBase() const { + DeviceMemoryBase AsDeviceMemoryBase() const { // This const_cast is necessary because DeviceMemoryBase's constructor // doesn't accept a const void*. This isn't ideal, but it's better than the // alternative of making a AsDeviceMemoryBase non-const member function. // // This is safe (i.e. not UB) because the casted pointer is derived from a // non-const pointer, namely mem_.opaque(). - return se::DeviceMemoryBase(const_cast<void*>(opaque()), size()); + return DeviceMemoryBase(const_cast<void*>(opaque()), size()); } // Returns the wrapped DeviceMemoryBase without freeing it, and deactivates // this object. Precondition: `this` is active. - TF_MUST_USE_RESULT se::DeviceMemoryBase Forget() { + TF_MUST_USE_RESULT DeviceMemoryBase Forget() { CHECK(allocator_ != nullptr) << "Can't call Forget() on an inactive (i.e. moved from, Forget()'ten, " "or Free()'ed) instance."; allocator_ = nullptr; - se::DeviceMemoryBase mem(mem_); - mem_ = se::DeviceMemoryBase(); + DeviceMemoryBase mem(mem_); + mem_ = DeviceMemoryBase(); return mem; } @@ -127,11 +125,11 @@ class OwningDeviceMemory { void Free(); private: - se::DeviceMemoryBase mem_; + DeviceMemoryBase mem_; int device_ordinal_; DeviceMemoryAllocator* allocator_; // Null if this object is inactive. }; -} // namespace xla +} // namespace stream_executor #endif // TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_ diff --git a/tensorflow/stream_executor/platform/default/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc index 80d71e25d48..9ceaa296dcb 100644 --- a/tensorflow/stream_executor/platform/default/dso_loader.cc +++ b/tensorflow/stream_executor/platform/default/dso_loader.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" -#include "cuda/cuda_config.h" +#include "third_party/gpus/cuda/cuda_config.h" #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/lib/error.h" #include "tensorflow/stream_executor/lib/path.h" diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 3f4bcab13c8..0feaa5687c4 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -17,11 +17,14 @@ load( "@local_config_tensorrt//:build_defs.bzl", "if_tensorrt", ) +load( + "//tensorflow/core:platform/default/cuda_build_defs.bzl", + "if_cuda_is_configured", +) load( "@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda", - "if_cuda_is_configured", ) load( "@local_config_rocm//rocm:build_defs.bzl", @@ -69,8 +72,6 @@ def if_not_v2(a): "//conditions:default": a, }) -# if_cuda_is_configured def placeholder - def if_cuda_is_configured_compat(x): return if_cuda_is_configured(x) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt index 8a9b8bbea6e..da5b5e089f8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt @@ -50,6 +50,12 @@ tf_proto { label: LABEL_OPTIONAL type: TYPE_BOOL } + field { + name: "share_cluster_devices_in_session" + number: 10 + label: LABEL_OPTIONAL + type: TYPE_BOOL + } reserved_range { start: 2 end: 3 diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt index 4e3960ce346..a961fcaf9a9 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt @@ -173,6 +173,12 @@ tf_proto { label: LABEL_OPTIONAL type: TYPE_BOOL } + field { + name: "share_cluster_devices_in_session" + number: 10 + label: LABEL_OPTIONAL + type: TYPE_BOOL + } reserved_range { start: 2 end: 3 diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt index 57b7f79dd32..09ec3e7acfa 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt @@ -58,7 +58,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'initial_value\', \'trainable\', \'collections\', \'validate_shape\', \'caching_device\', \'name\', \'variable_def\', \'dtype\', \'expected_shape\', \'import_scope\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + argspec: "args=[\'self\', \'initial_value\', \'trainable\', \'collections\', \'validate_shape\', \'caching_device\', \'name\', \'variable_def\', \'dtype\', \'expected_shape\', \'import_scope\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'shape\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], " } member_method { name: "assign" diff --git 
a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt index 7e2c4511094..94ffbca003f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt @@ -3,6 +3,7 @@ tf_class { is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>" is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: "<type \'object\'>" member { name: "output_classes" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt index a37b66b8f0b..0ed2d44e551 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt @@ -5,6 +5,7 @@ tf_class { is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>" is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: "<type \'object\'>" member { name: "output_classes" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt index e0d3510245e..a40c032e9a4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt @@ -15,6 +15,10 @@ tf_class { name: "experimental_optimization" mtype: "<type \'property\'>" } + member { + name: "experimental_slack" + mtype: "<type \'property\'>" + } member { name: "experimental_stats" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt index 2b0e113d4e5..60f7e1f4c72 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt @@ -5,6 +5,7 @@ tf_class { is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>" is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: "<type \'object\'>" member { name: "output_classes" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt index 21d463bfaef..d335061158d 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt @@ -5,6 +5,7 @@ tf_class { is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>" is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: 
"<type \'object\'>" member { name: "output_classes" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt index 136c48d98bb..39431952268 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt @@ -5,6 +5,7 @@ tf_class { is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>" is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: "<type \'object\'>" member { name: "output_classes" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-distribute-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-distribute-options.pbtxt index 828719b2f35..5909fc7db0c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-distribute-options.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-distribute-options.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "auto_shard" mtype: "<type \'property\'>" } + member { + name: "num_devices" + mtype: "<type \'property\'>" + } member_method { name: "__init__" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt index 1d6d71e0825..6221aaa0b0d 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt @@ -5,6 +5,7 @@ tf_class { is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>" is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: "<type \'object\'>" member { name: "output_classes" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt index 33d845e281b..d1903301787 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt @@ -5,6 +5,7 @@ tf_class { is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>" is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: "<type \'object\'>" member { name: "output_classes" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt index e4f1c335fa1..ca306922553 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt @@ -62,7 +62,7 @@ tf_module { } member_method { name: "decode_image" - argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, 
defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], " + argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\', \'expand_animations\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\', \'True\'], " } member_method { name: "decode_jpeg" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt index ccf7b48374a..73f765f395f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt @@ -74,7 +74,7 @@ tf_module { } member_method { name: "decode_image" - argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], " + argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\', \'expand_animations\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\', \'True\'], " } member_method { name: "decode_jpeg" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt index 7de68221c0a..ab09474a2bb 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt @@ -103,6 +103,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt index 85609735f24..9d9afe34685 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt @@ -104,6 +104,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt index 73b6eebcff7..af0da4dc907 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt index 11081edf49b..78d5475dd96 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt index 2512c30c60d..416a309f0e6 100644 --- 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt @@ -86,6 +86,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt index 26187f3e71c..f277bfb3918 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt index ed43cf371d5..e880978b82d 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt index 8b098325aa6..407eb2f116e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt index 975917d23aa..d429857694d 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-additive-attention.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt index b9abcb49fd7..3e1801ab1d5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt index d72f31fe198..52d44886118 100644 --- 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-attention.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt index 3f4d9a6eb57..3730402351a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt index a1666f92c49..e47e21ec17c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt index 7e71d7b1d1c..2ffc509c835 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt index 2b54b0ac31d..e993e453544 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt index 2392a96ae2a..4c27bb2786f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt index 8747160a58d..b1148dcd844 100644 --- 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt index 27f8b336ba4..55ab4e55e23 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt index 36e7ed960da..89b5f4f25e7 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt index 6d27d9216bc..d29e7cf3720 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt @@ -83,6 +83,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt index f061166e987..aa1a76372af 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt index 792573a2708..2f618c28e2f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt @@ -160,6 +160,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt index 0caff0d44eb..c3d60603fe2 100644 --- 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt index 0cf63763c3a..134140dfad3 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt index 137afe4d320..d4fca491218 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt index b973c198ba3..195e1055687 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt index f3c86ec4ee7..037105c5d10 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt index d19703c3234..53fa4329db3 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt index 4c6a4fab438..73b0c70049a 100644 --- 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt index 6247a8389e8..064ddf82713 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt index 284b3855ead..80eb98b58b8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt index 8135ced769c..6f4126f76ef 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt index 01037abdffa..703838b594e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt index bbf91ca4b07..32391693f29 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt index ef48b2b7515..6545f772ac7 100644 --- 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt index 4822a699fcc..969fd8cde35 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt @@ -88,6 +88,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt index ba9c428f719..1aa8523c3a7 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt @@ -88,6 +88,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt index de0b35f53b4..55e87804620 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt index c9dfb597c50..205652a6c4f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt index 32cba1888bf..9435078e6b8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt index afaf6483d18..e6f09477dad 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt +++ 
b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt index 59b04042b18..bd71d50f803 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt index f284946135e..1ace5ecc95a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt index 3190565934b..1407e657dfd 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt index 299169f60c0..72a89d180de 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt index 532e2869b1f..67defe65afe 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt index c2769490726..8e288cba394 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt @@ -143,6 +143,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type 
\'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt index fb7431fdf97..7b9961570fd 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt index 9bad46e1264..26a591459a1 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt index 1e9030ded6c..773d98fb890 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt index d7148b022ba..318d3bed221 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt index a8350afc850..089955308ca 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt index a0672991052..c3c0c038983 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt @@ -79,6 +79,10 
@@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt index 0d0ad663612..55c58704e58 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt index 45438bd1b81..30fcd8ee76a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt index 0bf8efa7ca9..6f3dc3faa6e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt index b4446b179ad..18197935889 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt index ccd2ee40630..dd9cb0e1a85 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt index f4aad9e14b9..380c6a43db5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt +++ 
b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt index 6a58f7b2e8d..fca2eb2a6e2 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt index 1a76d09ca21..59b2d6da423 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt index b64ca63223d..6f2277f015e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt index 544c68df527..58228d3415f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt index 446c51c7056..e8c3cf3537f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt @@ -139,6 +139,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt index 2dc143c824d..9c388afb097 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt +++ 
b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer-normalization.pbtxt index e4d70a6b85a..8e149c48f0c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer-normalization.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer-normalization.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt index 23ce7141928..097a4c75fa5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt @@ -77,6 +77,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt index 8bca987ebc2..4d0998450c5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt index fcf0b9ccb05..292e8218f01 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt index 6491126e3b0..465cc1bf9e0 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt index 4535ddcacc2..9eba02262a7 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt +++ 
b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt index b1ca63566e6..08636ccc843 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt index d47303ae24f..377c7ceb0ad 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt index ba14ef382bd..43c3b4dab7c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt index d30dc3b15c8..54debffbd57 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt index d3307973d41..6733e4a906b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt index 9db6af6c65f..c7ed48d5c60 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt @@ -79,6 
+79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt index cec1596cf84..8a243220802 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt index f89e27f87e5..2a9c04b1a3f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt index e1c450b4f2d..8605b7a7c84 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt index 4583764ba7c..31668c596ef 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt index 0d9553fed18..244f1565f44 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt index c5790b3f14d..e2587275d3c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff 
--git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt index e4aae27f504..7d659b5e9eb 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt index bd6548f0949..02ae681d5d7 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt index 731341b9189..cf08b25c298 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt index db493c5ca72..84bd0dfd24a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt index 26b96ab11fe..b87fb3f673b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt index ade9dbfcc17..2b8aba89f65 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt index 3b0cd02a762..6162b9fe9d9 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt index f7952679514..e6ce21c452b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt index 3be83609dfd..f6ccb281e2b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt @@ -131,6 +131,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt index 21169cfa3d0..9a2c62a3c14 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt index 790682e29ce..23f29f7b177 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt index 400159f0e14..6d6bb8728a7 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff 
--git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt index 41c02d5edc2..3d2ca036ce6 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt index 1407d9589f2..61276680432 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt @@ -86,6 +86,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt index 5679b6390d7..5ea0e253193 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt index ea7cb15afe2..a0457e08a48 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt index d0bf532e5af..85042312e11 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt index b5e994ebabb..5edc9f5ef9c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt index a30d438f028..2282e9afc9f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt index 50152d77835..425e736fff7 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt index d97e4271822..acfe1d4db05 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt index a7e49b85ca4..e8854a00abe 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt index cbf69a5beab..e76f6d6b5c2 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt index 0bcba0face1..0a6cb86d3a8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt index 840f019720c..91dcbd85439 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt index 9270466ff08..24854eda5ef 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt index 1cbaa41f6c0..1cb9a7ff528 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt index 84f6159502f..ae0cc85a174 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt index 819d56c59b2..20567bac7f3 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt index ddcaaab9baa..4512fc87819 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" 
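Note: the golden-file hunks above and below all record the same API change: `trainable` is now exposed as a read/write property on the base Keras `Layer`, so every layer and metric golden snapshot gains this member. A minimal, illustrative sketch of what the property controls, assuming standard `tf.keras` behavior (the example names are not part of this patch):

    import tensorflow as tf

    dense = tf.keras.layers.Dense(4)
    dense.build((None, 8))                   # create the layer's weights
    dense.trainable = False                  # property setter: freeze the layer
    assert not dense.trainable               # property getter, as listed in the golden files
    assert dense.trainable_variables == []   # frozen weights report as non-trainable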
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt index 61e5f311273..8246b68f73b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt index 9bb4fac45a6..0afbb70c300 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt index 6d109a47e21..29690d0f666 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt index 28e685671b5..a1fbded163a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt index 3e7651b8546..6a5fe85c29e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt index a683124dd49..965098d8d2a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt index d399050b115..49a620dad70 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt index 10f3aaac62a..8ac094372bd 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt index 2cc27e7c1f1..3dd2cfce2f0 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt index 545fb62e75e..350bf484786 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt index 0f2c4ac1ae0..d96d5030ad3 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt @@ -81,6 +81,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt index 05af94cfed3..41b26105d04 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + 
} member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt index bffb2ab0d2e..b72ed244434 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt index 841952707b4..3a82535552a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt @@ -87,6 +87,10 @@ tf_class { name: "total" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt index 021df9e0653..22a91dcedf6 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt index 2486468aca5..ffe189f8e17 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt index 1347230191c..2041c56a2a9 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt index 307ce9c80fd..637f129f2bf 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt index 1614323b2c4..5008b2eb22f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt index 4fd4e52b026..5f470754ce1 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt @@ -81,6 +81,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt index 0b2be409d8d..c03826e608c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt index ddcbd8087e5..c3fe4ede39e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt index df3d6ef1da9..dac78639cf5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt index 23431e4781b..345cae9ab44 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt +++ 
b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt index 388267b4585..757db172181 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt index 7c23fd5234c..76a5473fe38 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt index d815fe56699..704ab64939e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt index 4a7edf5c529..4faa79da1ef 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt index c8bb4275a4e..17249aab1dc 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt index e73842c1a23..8570935eed8 100644 --- 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt index 56f85ae86dc..62949eacb34 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt @@ -103,6 +103,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt index 21018c657d8..ac70d55394a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt @@ -104,6 +104,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt index fe469b421bb..fe333669869 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt @@ -89,6 +89,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt index 16b87f9182d..dccf58ef136 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt @@ -89,6 +89,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt index aed4197a288..9c155b24a77 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt @@ -89,6 +89,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt index dcca23e834b..5c8b40925f2 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt +++ 
b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt @@ -89,6 +89,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt index 6ee991e8c69..c30de1fff36 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt @@ -89,6 +89,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt index 3e9f60e5cb4..76616740b1e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt @@ -90,6 +90,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt index a09c8d8c25a..5e6b54c3310 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt @@ -89,6 +89,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt index e86bf9fe032..3bba8acef75 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt @@ -90,6 +90,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt index 12cc0f75156..75a44aaaba8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt @@ -89,6 +89,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt index 85944f66a74..e8528443353 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt @@ -88,6 +88,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type 
\'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt index 74a5c1cf9db..bddf79574a6 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt @@ -88,6 +88,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt index 92e196f8c85..f48d3abeeb2 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt @@ -88,6 +88,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt index 905b5c1e279..e712e362506 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt @@ -86,6 +86,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt index e551ec7ed64..9c9328ba60b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt @@ -89,6 +89,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt index 8ff72d52e05..7b3ebab5724 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt @@ -89,6 +89,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt index 64cad4740f7..77bc2d164f9 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt @@ -89,6 +89,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt index a653b84ad9f..ee90918bab0 100644 --- 
a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt @@ -90,6 +90,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt index 1f2dbf2d9be..a684efd3a79 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt @@ -90,6 +90,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt index f6a50068bd9..8e1d0540a6c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt @@ -97,6 +97,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt index 5d437bb6a13..612228a3838 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt @@ -97,6 +97,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt index cfc6400ff5a..815c075336c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt @@ -97,6 +97,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt index 3ac302e7420..326c0a2b248 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt @@ -97,6 +97,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt 
b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt index 9daccc53b0d..282c0899dae 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt @@ -98,6 +98,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt index bd36c2f181d..ddc5943042e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt @@ -98,6 +98,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt index f863d615afe..4bb92bc6acd 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt @@ -97,6 +97,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt index ac9e102906e..8ad90e68253 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt @@ -97,6 +97,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt index ca9e2f7751f..c4a1b595396 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt @@ -96,6 +96,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt index 75c82ca8ffa..3a5513a4cd9 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt @@ -95,6 +95,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt index 
c118f5d9a29..32a31290b03 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt @@ -98,6 +98,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt index a5c4fb03e26..091cc04357e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt @@ -1252,6 +1252,10 @@ tf_module { name: "fill" argspec: "args=[\'dims\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "fingerprint" + argspec: "args=[\'data\', \'method\', \'name\'], varargs=None, keywords=None, defaults=[\'farmhash64\', \'None\'], " + } member_method { name: "fixed_size_partitioner" argspec: "args=[\'num_shards\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], " @@ -2416,6 +2420,10 @@ tf_module { name: "variables_initializer" argspec: "args=[\'var_list\', \'name\'], varargs=None, keywords=None, defaults=[\'init\'], " } + member_method { + name: "vectorized_map" + argspec: "args=[\'fn\', \'elems\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "verify_tensor_all_finite" argspec: "args=[\'t\', \'msg\', \'name\', \'x\', \'message\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index d2e12dc7156..12e668952bc 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -1332,6 +1332,10 @@ tf_module { name: "FilterDataset" argspec: "args=[\'input_dataset\', \'other_arguments\', \'predicate\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "Fingerprint" + argspec: "args=[\'data\', \'method\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "FixedLengthRecordDataset" argspec: "args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -2390,7 +2394,7 @@ tf_module { } member_method { name: "PrefetchDataset" - argspec: "args=[\'input_dataset\', \'buffer_size\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'input_dataset\', \'buffer_size\', \'output_types\', \'output_shapes\', \'slack_period\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], " } member_method { name: "Prelinearize" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt index 2538de661b3..d981983e938 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt @@ -12,7 +12,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'checkpoint\', \'directory\', \'max_to_keep\', \'keep_checkpoint_every_n_hours\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: 
"args=[\'self\', \'checkpoint\', \'directory\', \'max_to_keep\', \'keep_checkpoint_every_n_hours\', \'checkpoint_name\'], varargs=None, keywords=None, defaults=[\'None\', \'ckpt\'], " } member_method { name: "save" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt index ff3edf88ec0..5ecfa493b56 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt @@ -57,7 +57,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'initial_value\', \'trainable\', \'validate_shape\', \'caching_device\', \'name\', \'variable_def\', \'dtype\', \'import_scope\', \'constraint\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + argspec: "args=[\'self\', \'initial_value\', \'trainable\', \'validate_shape\', \'caching_device\', \'name\', \'variable_def\', \'dtype\', \'import_scope\', \'constraint\', \'synchronization\', \'aggregation\', \'shape\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], " } member_method { name: "assign" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt index 64e0a2ae2e6..bb56967c18a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt @@ -2,6 +2,7 @@ path: "tensorflow.data.Dataset" tf_class { is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: "<type \'object\'>" member_method { name: "__init__" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt index f20fb7323fc..597c5bce102 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt @@ -4,6 +4,7 @@ tf_class { is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>" is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: "<type \'object\'>" member_method { name: "__init__" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt index e0d3510245e..a40c032e9a4 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt @@ -15,6 +15,10 @@ tf_class { name: "experimental_optimization" mtype: "<type \'property\'>" } + member { + name: "experimental_slack" + mtype: "<type \'property\'>" + } member { name: "experimental_stats" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt index 5abd1002656..c24bac5bd95 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt @@ -3,6 +3,7 @@ tf_class { is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV2\'>" is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: "<type \'object\'>" member_method { name: "__init__" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt index 17527aee4fc..8946cecfc83 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt @@ -4,6 +4,7 @@ tf_class { is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>" is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: "<type \'object\'>" member_method { name: "__init__" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt index 8292a645273..2365c62a61c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt @@ -4,6 +4,7 @@ tf_class { is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>" is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: "<type \'object\'>" member_method { name: "__init__" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-distribute-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-distribute-options.pbtxt index 828719b2f35..5909fc7db0c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-distribute-options.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-distribute-options.pbtxt @@ -7,6 +7,10 @@ tf_class { name: "auto_shard" mtype: "<type \'property\'>" } + member { + name: "num_devices" + mtype: "<type \'property\'>" + } member_method { name: "__init__" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt index 56102634161..af008c6ad5b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt @@ -4,6 +4,7 @@ tf_class { is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>" is_instance: "<class 
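The v2 dataset hunks above show the dataset classes now registering as `CompositeTensor`, `tf.data.Options` gaining an `experimental_slack` property (paired with the new `slack_period` attribute on the raw `PrefetchDataset` op), and `DistributeOptions` exposing `num_devices`. A small sketch of opting a pipeline into slack; the pipeline itself is purely illustrative:

import tensorflow as tf

# Illustrative pipeline; experimental_slack applies to the terminal prefetch.
ds = tf.data.Dataset.range(100).batch(8).prefetch(1)

options = tf.data.Options()
options.experimental_slack = True
ds = ds.with_options(options)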
\'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: "<type \'object\'>" member_method { name: "__init__" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt index 1c7fb500754..34370adc7da 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt @@ -4,6 +4,7 @@ tf_class { is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>" is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>" is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>" + is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>" is_instance: "<type \'object\'>" member_method { name: "__init__" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt index c8651f813be..f51c46b4462 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt @@ -62,7 +62,7 @@ tf_module { } member_method { name: "decode_image" - argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], " + argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\', \'expand_animations\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\', \'True\'], " } member_method { name: "decode_jpeg" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt index 8fee5ae5526..865f0f86134 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt @@ -54,7 +54,7 @@ tf_module { } member_method { name: "decode_image" - argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], " + argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\', \'expand_animations\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\', \'True\'], " } member_method { name: "decode_jpeg" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt index 7de68221c0a..ab09474a2bb 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt @@ -103,6 +103,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt index 85609735f24..9d9afe34685 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt @@ -104,6 +104,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: 
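The `tensorflow.image` and `tensorflow.io` hunks above add an `expand_animations` flag to `decode_image`. A hedged sketch of the new argument; the file path is a placeholder:

import tensorflow as tf

contents = tf.io.read_file("/tmp/example.gif")  # placeholder path
frame = tf.io.decode_image(contents, channels=3, expand_animations=False)
# With expand_animations=False an animated GIF decodes to a single 3-D
# [height, width, channels] frame; the default (True) keeps the 4-D
# [frames, height, width, channels] behaviour.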
"trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt index 6a00e0a25d4..24385e2722a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt @@ -5,7 +5,7 @@ tf_class { is_instance: "<type \'object\'>" member_method { name: "__init__" - argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'write_graph\', \'write_images\', \'update_freq\', \'profile_batch\'], varargs=None, keywords=kwargs, defaults=[\'logs\', \'0\', \'True\', \'False\', \'epoch\', \'2\'], " + argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'write_graph\', \'write_images\', \'update_freq\', \'profile_batch\', \'embeddings_freq\', \'embeddings_metadata\'], varargs=None, keywords=kwargs, defaults=[\'logs\', \'0\', \'True\', \'False\', \'epoch\', \'2\', \'0\', \'None\'], " } member_method { name: "on_batch_begin" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt index 73b6eebcff7..af0da4dc907 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt index 11081edf49b..78d5475dd96 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt index 2512c30c60d..416a309f0e6 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-abstract-r-n-n-cell.pbtxt @@ -86,6 +86,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt index 26187f3e71c..f277bfb3918 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt index ed43cf371d5..e880978b82d 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt index 8b098325aa6..407eb2f116e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt index 975917d23aa..d429857694d 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-additive-attention.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt index b9abcb49fd7..3e1801ab1d5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt index d72f31fe198..52d44886118 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-attention.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt index 3f4d9a6eb57..3730402351a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt index a1666f92c49..e47e21ec17c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt index 7e71d7b1d1c..2ffc509c835 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt index 2b54b0ac31d..e993e453544 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt index 2392a96ae2a..4c27bb2786f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt index 8747160a58d..b1148dcd844 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt index 27f8b336ba4..55ab4e55e23 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt index 3727d4638f0..cbbfd50a3e9 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt index 6d27d9216bc..d29e7cf3720 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt @@ -83,6 +83,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt index f061166e987..aa1a76372af 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt index 792573a2708..2f618c28e2f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt @@ -160,6 +160,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt index 0caff0d44eb..c3d60603fe2 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt index 0cf63763c3a..134140dfad3 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt index 137afe4d320..d4fca491218 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt index b973c198ba3..195e1055687 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt index f3c86ec4ee7..037105c5d10 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt index d19703c3234..53fa4329db3 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt index 4c6a4fab438..73b0c70049a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt index 6247a8389e8..064ddf82713 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt index 284b3855ead..80eb98b58b8 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt index 8135ced769c..6f4126f76ef 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt index 01037abdffa..703838b594e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt index bbf91ca4b07..32391693f29 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt index ef48b2b7515..6545f772ac7 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt index de0b35f53b4..55e87804620 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt index c9dfb597c50..205652a6c4f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt index 32cba1888bf..9435078e6b8 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt index afaf6483d18..e6f09477dad 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt index 59b04042b18..bd71d50f803 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt index f284946135e..1ace5ecc95a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt index 3190565934b..1407e657dfd 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt index 299169f60c0..72a89d180de 100644 --- 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt index 092fb5590d9..f7aa2fdce3d 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt index 27f5ef1b9ad..019dd0454e2 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt @@ -145,6 +145,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt index fb7431fdf97..7b9961570fd 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt index 9bad46e1264..26a591459a1 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt index 1e9030ded6c..773d98fb890 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt index d7148b022ba..318d3bed221 100644 --- 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt index a8350afc850..089955308ca 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt index a0672991052..c3c0c038983 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt index 0d0ad663612..55c58704e58 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt index 45438bd1b81..30fcd8ee76a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt index 0bf8efa7ca9..6f3dc3faa6e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt 
b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt index b4446b179ad..18197935889 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt index ccd2ee40630..dd9cb0e1a85 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt index f4aad9e14b9..380c6a43db5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt index 6a58f7b2e8d..fca2eb2a6e2 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt index 1a76d09ca21..59b2d6da423 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt index b64ca63223d..6f2277f015e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt index 70e06a65979..52b41d9b26c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt index ec9e19b03a2..f7851d96e8a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt @@ -141,6 +141,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt index 2dc143c824d..9c388afb097 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer-normalization.pbtxt index e4d70a6b85a..8e149c48f0c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer-normalization.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer-normalization.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt index 23ce7141928..097a4c75fa5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt @@ -77,6 +77,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt index 8bca987ebc2..4d0998450c5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt 
b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt index fcf0b9ccb05..292e8218f01 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt index 6491126e3b0..465cc1bf9e0 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt index 4535ddcacc2..9eba02262a7 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt index b1ca63566e6..08636ccc843 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt index d47303ae24f..377c7ceb0ad 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt index ba14ef382bd..43c3b4dab7c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt 
b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt index d30dc3b15c8..54debffbd57 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt index d3307973d41..6733e4a906b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt index 9db6af6c65f..c7ed48d5c60 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt index cec1596cf84..8a243220802 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt index f89e27f87e5..2a9c04b1a3f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt index e1c450b4f2d..8605b7a7c84 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt index 4583764ba7c..31668c596ef 100644 --- 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt index 0d9553fed18..244f1565f44 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt index c5790b3f14d..e2587275d3c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt index e4aae27f504..7d659b5e9eb 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt index bd6548f0949..02ae681d5d7 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt index 731341b9189..cf08b25c298 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt index db493c5ca72..84bd0dfd24a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt @@ -80,6 
+80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt index 26b96ab11fe..b87fb3f673b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt index ade9dbfcc17..2b8aba89f65 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt index 3b0cd02a762..6162b9fe9d9 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt index f7952679514..e6ce21c452b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt index 3be83609dfd..f6ccb281e2b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt @@ -131,6 +131,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt index 21169cfa3d0..9a2c62a3c14 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt +++ 
b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt index 790682e29ce..23f29f7b177 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt index 400159f0e14..6d6bb8728a7 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt index 41c02d5edc2..3d2ca036ce6 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt index 1407d9589f2..61276680432 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt @@ -86,6 +86,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt index 5679b6390d7..5ea0e253193 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt index ea7cb15afe2..a0457e08a48 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt 
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt index d0bf532e5af..85042312e11 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt index b5e994ebabb..5edc9f5ef9c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt index a30d438f028..2282e9afc9f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt index 50152d77835..425e736fff7 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt index d97e4271822..acfe1d4db05 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt index a7e49b85ca4..e8854a00abe 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt +++ 
b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt index cbf69a5beab..e76f6d6b5c2 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt index 0bcba0face1..0a6cb86d3a8 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt index 840f019720c..91dcbd85439 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt index 9270466ff08..24854eda5ef 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt index 1cbaa41f6c0..1cb9a7ff528 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt index 84f6159502f..ae0cc85a174 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt +++ 
b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt index 819d56c59b2..20567bac7f3 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt index ddcaaab9baa..4512fc87819 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt index 61e5f311273..8246b68f73b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt index 9bb4fac45a6..0afbb70c300 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt index 6d109a47e21..29690d0f666 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt index 28e685671b5..a1fbded163a 100644 --- 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt index 3e7651b8546..6a5fe85c29e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt index a683124dd49..965098d8d2a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt index d399050b115..49a620dad70 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt index 10f3aaac62a..8ac094372bd 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt index 2cc27e7c1f1..3dd2cfce2f0 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt index 
545fb62e75e..350bf484786 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt index 0f2c4ac1ae0..d96d5030ad3 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt @@ -81,6 +81,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt index 05af94cfed3..41b26105d04 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt index bffb2ab0d2e..b72ed244434 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt index 841952707b4..3a82535552a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt @@ -87,6 +87,10 @@ tf_class { name: "total" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt index 021df9e0653..22a91dcedf6 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt index 
2486468aca5..ffe189f8e17 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt index 1347230191c..2041c56a2a9 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt index 307ce9c80fd..637f129f2bf 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt index 1614323b2c4..5008b2eb22f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt index 4fd4e52b026..5f470754ce1 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt @@ -81,6 +81,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt index 0b2be409d8d..c03826e608c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt index 
ddcbd8087e5..c3fe4ede39e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt index df3d6ef1da9..dac78639cf5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt index 23431e4781b..345cae9ab44 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt index 388267b4585..757db172181 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt index 7c23fd5234c..76a5473fe38 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt index d815fe56699..704ab64939e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt index 4a7edf5c529..4faa79da1ef 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt index c8bb4275a4e..17249aab1dc 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt index e73842c1a23..8570935eed8 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt index 56f85ae86dc..62949eacb34 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt @@ -103,6 +103,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt index 21018c657d8..ac70d55394a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt @@ -104,6 +104,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt index 22425588e88..afa9598baf8 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt 
b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt index 1858961b7c8..44425c6c290 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt index eeffba3d435..711cc5e0eb3 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt index 9cb7245a19e..cf1053ca7f1 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt index c6aa5a3b47f..3beb5da6161 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt index c2ec5e074b9..06d86d890d3 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt index e727df0cc5c..d4ef1c9bc89 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt 
index 22e796901d6..735dfe8edbe 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt index 5f709fa4d92..e58ad3f22c2 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt index 3a458d86114..30b9d0e7bec 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt index e6b15028afd..765f9b30c23 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt index db4a3158c36..d12afc6f3ac 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt index ae0a2adccdd..afd1d57ec0d 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt index 8db5a38e198..f0d8fd8b626 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt +++ 
b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt index 0581fb27cf7..bf16a6c8c04 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt index 7c8591907e3..004825bbda4 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt index d078882c876..8b656e2f0d0 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt @@ -81,6 +81,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt index 26e2fc7aa88..39b589e6cfc 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt index 4f401835ebe..f0503f66e34 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt index e3b152736e3..5fc12fb9590 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt +++ 
b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt @@ -87,6 +87,10 @@ tf_class { name: "total" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt index 7be9dc137f2..88901af8550 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt index 5b13db02962..6827926e564 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt @@ -78,6 +78,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt index 3d206f19d51..ccb453c2793 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt index 4b2456a4107..04c3ded5364 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt index 0cd7170b644..b81e4c533d5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt @@ -79,6 +79,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt index 27c4143328d..99037be1d16 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt @@ -81,6 +81,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" 
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt index e1dca78961c..49c09513a5f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt index 10af9442684..d08920f9281 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt index 2b102c333f1..ab139937f22 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt index 95ec2ee5664..af188230586 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt index eb467f2c4c5..e32214c2cce 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt index b373aab7b04..ecd99a4bfa6 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member 
{ + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt index bfc7e92d2e3..5ad20087fa5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt index 525f16101e3..97c11cb1264 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt @@ -82,6 +82,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt index 6cbb05133ae..449914554fd 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt index 6425d8a2260..c38d8ab0eea 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt @@ -80,6 +80,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-device-wrapper.pbtxt index cbc4b47cd65..b8fd91bbfc0 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-device-wrapper.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-device-wrapper.pbtxt @@ -89,6 +89,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-dropout-wrapper.pbtxt index 9ca22c44a55..f7a86688d66 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-dropout-wrapper.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-dropout-wrapper.pbtxt @@ -89,6 +89,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type 
\'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-residual-wrapper.pbtxt index f1597248216..f8854a6f052 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-residual-wrapper.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.-r-n-n-cell-residual-wrapper.pbtxt @@ -89,6 +89,10 @@ tf_class { name: "submodules" mtype: "<type \'property\'>" } + member { + name: "trainable" + mtype: "<type \'property\'>" + } member { name: "trainable_variables" mtype: "<type \'property\'>" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt index fed938e67bb..656d026cb63 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt @@ -604,6 +604,10 @@ tf_module { name: "fill" argspec: "args=[\'dims\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "fingerprint" + argspec: "args=[\'data\', \'method\', \'name\'], varargs=None, keywords=None, defaults=[\'farmhash64\', \'None\'], " + } member_method { name: "floor" argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -1056,6 +1060,10 @@ tf_module { name: "variable_creator_scope" argspec: "args=[\'variable_creator\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "vectorized_map" + argspec: "args=[\'fn\', \'elems\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "where" argspec: "args=[\'condition\', \'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index d2e12dc7156..12e668952bc 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -1332,6 +1332,10 @@ tf_module { name: "FilterDataset" argspec: "args=[\'input_dataset\', \'other_arguments\', \'predicate\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "Fingerprint" + argspec: "args=[\'data\', \'method\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "FixedLengthRecordDataset" argspec: "args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -2390,7 +2394,7 @@ tf_module { } member_method { name: "PrefetchDataset" - argspec: "args=[\'input_dataset\', \'buffer_size\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'input_dataset\', \'buffer_size\', \'output_types\', \'output_shapes\', \'slack_period\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], " } member_method { name: "Prelinearize" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt index 2538de661b3..d981983e938 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt @@ -12,7 +12,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'checkpoint\', \'directory\', 
\'max_to_keep\', \'keep_checkpoint_every_n_hours\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'checkpoint\', \'directory\', \'max_to_keep\', \'keep_checkpoint_every_n_hours\', \'checkpoint_name\'], varargs=None, keywords=None, defaults=[\'None\', \'ckpt\'], " } member_method { name: "save" diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD index a2440eaebab..7f7748c6ab6 100644 --- a/tensorflow/tools/api/tests/BUILD +++ b/tensorflow/tools/api/tests/BUILD @@ -1,5 +1,10 @@ # TensorFlow API backwards compatibility tests. +load( + "//tensorflow:tensorflow.bzl", + "py_test", +) + package( default_visibility = ["//tensorflow/tools/api:__subpackages__"], ) @@ -22,8 +27,12 @@ py_test( "//tensorflow/tools/api/tests:API_UPDATE_WARNING.txt", "//tensorflow/tools/api/tests:README.txt", ], + python_version = "PY2", srcs_version = "PY2AND3", - tags = ["no_rocm"], + tags = [ + "no_pip", + "no_rocm", + ], deps = [ "//tensorflow:tensorflow_py", "//tensorflow/python:client_testlib", @@ -39,6 +48,7 @@ py_test( py_test( name = "deprecation_test", srcs = ["deprecation_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow:tensorflow_py", diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py index 7ed92dc8641..b1529370df9 100644 --- a/tensorflow/tools/api/tests/api_compatibility_test.py +++ b/tensorflow/tools/api/tests/api_compatibility_test.py @@ -357,17 +357,24 @@ class ApiCompatibilityTest(test.TestCase): @test_util.run_v1_only('b/120545219') def testAPIBackwardsCompatibility(self): - api_version = 2 if '_api.v2' in tf.__name__ else 1 + api_version = 2 if '_api.v2' in tf.bitwise.__name__ else 1 golden_file_pattern = os.path.join( resource_loader.get_root_dir_with_all_resources(), _KeyToFilePath('*', api_version)) + omit_golden_symbols_map = {} + if api_version == 2 and FLAGS.only_test_core_api: + # In TF 2.0 these summary symbols are imported from TensorBoard. + omit_golden_symbols_map['tensorflow.summary'] = [ + 'audio', 'histogram', 'image', 'scalar', 'text'] + self._checkBackwardsCompatibility( tf, golden_file_pattern, api_version, # Skip compat.v1 and compat.v2 since they are validated # in separate tests. - additional_private_map={'tf.compat': ['v1', 'v2']}) + additional_private_map={'tf.compat': ['v1', 'v2']}, + omit_golden_symbols_map=omit_golden_symbols_map) # Also check that V1 API has contrib self.assertTrue( diff --git a/tensorflow/tools/ci_build/builds/docker_cpu_pip.sh b/tensorflow/tools/ci_build/builds/docker_cpu_pip.sh index 342f1674960..ab6c19b9f04 100755 --- a/tensorflow/tools/ci_build/builds/docker_cpu_pip.sh +++ b/tensorflow/tools/ci_build/builds/docker_cpu_pip.sh @@ -36,4 +36,9 @@ bazel test --define=no_tensorflow_py_deps=true \ --test_size_filters=small,medium \ --test_timeout 300,450,1200,3600 \ --test_output=errors \ - -- //${PIP_TEST_ROOT}/tensorflow/python/... + -- //${PIP_TEST_ROOT}/tensorflow/python/... 
\ + -//${PIP_TEST_ROOT}/tensorflow/python/keras:training_eager_test \ + -//${PIP_TEST_ROOT}/tensorflow/python/keras:base_layer_test \ + -//${PIP_TEST_ROOT}/tensorflow/python/distribute:distribute_lib_test \ + -//${PIP_TEST_ROOT}/tensorflow/python:virtual_gpu_test \ + -//${PIP_TEST_ROOT}/tensorflow/python:virtual_gpu_test_gpu diff --git a/tensorflow/tools/ci_build/builds/pip_new.sh b/tensorflow/tools/ci_build/builds/pip_new.sh index dbbf907ce67..5eb39bff3b4 100755 --- a/tensorflow/tools/ci_build/builds/pip_new.sh +++ b/tensorflow/tools/ci_build/builds/pip_new.sh @@ -257,7 +257,7 @@ PYTHON_BIN_PATH_INIT=${PYTHON_BIN_PATH} PIP_BIN_PATH="$(which pip${PY_MAJOR_MINOR_VER})" # PIP packages -INSTALL_EXTRA_PIP_PACKAGES="portpicker scipy ${TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES}" +INSTALL_EXTRA_PIP_PACKAGES="portpicker scipy scikit-learn ${TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES}" ########################################################################### # Build TF PIP Package diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index 7e9e11dfc91..b5da21cba4e 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -128,8 +128,8 @@ pip2 install --upgrade h5py==2.8.0 pip3 install --upgrade h5py==2.8.0 # Estimator -pip2 install tf-estimator-nightly==1.12.0.dev20181203 --no-deps -pip3 install tf-estimator-nightly==1.12.0.dev20181203 --no-deps +pip2 install tf-estimator-nightly --no-deps +pip3 install tf-estimator-nightly --no-deps # Argparse pip2 install --upgrade argparse diff --git a/tensorflow/tools/common/BUILD b/tensorflow/tools/common/BUILD index 8c01d15a806..05fc81b0d97 100644 --- a/tensorflow/tools/common/BUILD +++ b/tensorflow/tools/common/BUILD @@ -21,6 +21,7 @@ py_library( py_test( name = "public_api_test", srcs = ["public_api_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":public_api", @@ -38,6 +39,7 @@ py_library( py_test( name = "traverse_test", srcs = ["traverse_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":test_module1", diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD index 6db6669a547..4640132f1aa 100644 --- a/tensorflow/tools/compatibility/BUILD +++ b/tensorflow/tools/compatibility/BUILD @@ -2,6 +2,7 @@ load( "//tensorflow:tensorflow.bzl", "tf_copts", # @unused "tf_cc_test", # @unused + "py_test", ) licenses(["notice"]) # Apache 2.0 @@ -27,6 +28,7 @@ py_library( py_test( name = "ast_edits_test", srcs = ["ast_edits_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":ast_edits", @@ -39,6 +41,7 @@ py_test( py_binary( name = "tf_upgrade", srcs = ["tf_upgrade.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [":tf_upgrade_lib"], ) @@ -53,7 +56,11 @@ py_library( py_test( name = "tf_upgrade_test", srcs = ["tf_upgrade_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", + tags = [ + "no_pip", + ], deps = [ ":tf_upgrade_lib", "//tensorflow/python:client_testlib", @@ -84,6 +91,12 @@ py_library( deps = [":renames_v2"], ) +py_library( + name = "module_deprecations_v2", + srcs = ["module_deprecations_v2.py"], + deps = [":ast_edits"], +) + py_library( name = "tf_upgrade_v2_lib", srcs = ["tf_upgrade_v2.py"], @@ -91,26 +104,40 @@ py_library( deps = [ ":all_renames_v2", ":ast_edits", + ":module_deprecations_v2", ":reorders_v2", "@six_archive//:six", ], ) +py_library( + name = "tf_upgrade_v2_safety_lib", + srcs = 
["tf_upgrade_v2_safety.py"], + srcs_version = "PY2AND3", + deps = [ + ":ast_edits", + ":module_deprecations_v2", + ], +) + py_binary( name = "tf_upgrade_v2", srcs = ["tf_upgrade_v2_main.py"], main = "tf_upgrade_v2_main.py", + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":ast_edits", ":ipynb", ":tf_upgrade_v2_lib", + ":tf_upgrade_v2_safety_lib", ], ) py_test( name = "tf_upgrade_v2_test", srcs = ["tf_upgrade_v2_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":tf_upgrade_v2_lib", @@ -124,6 +151,18 @@ py_test( ], ) +py_test( + name = "tf_upgrade_v2_safety_test", + srcs = ["tf_upgrade_v2_safety_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":tf_upgrade_v2_safety_lib", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_test_lib", + "@six_archive//:six", + ], +) + # Keep for reference, this test will succeed in 0.11 but fail in 1.0 # py_test( # name = "test_file_v0_11", @@ -154,6 +193,7 @@ py_test( name = "test_file_v1_0", size = "small", srcs = ["test_file_v1_0.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow:tensorflow_py", @@ -179,6 +219,7 @@ py_test( name = "test_file_v1_12", size = "small", srcs = ["testdata/test_file_v1_12.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow:tensorflow_py", @@ -189,6 +230,7 @@ py_test( name = "test_file_v2_0", size = "small", srcs = ["test_file_v2_0.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow:tensorflow_py", diff --git a/tensorflow/tools/compatibility/module_deprecations_v2.py b/tensorflow/tools/compatibility/module_deprecations_v2.py new file mode 100644 index 00000000000..ba542954a1d --- /dev/null +++ b/tensorflow/tools/compatibility/module_deprecations_v2.py @@ -0,0 +1,64 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Module deprecation warnings for TensorFlow 2.0.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.tools.compatibility import ast_edits + + +_CONTRIB_WARNING = ( + ast_edits.ERROR, + "<function name> cannot be converted automatically. tf.contrib will not" + " be distributed with TensorFlow 2.0, please consider an alternative in" + " non-contrib TensorFlow, a community-maintained repository, or fork " + "the required code." +) + +_FLAGS_WARNING = ( + ast_edits.ERROR, + "tf.flags has been removed, please use the argparse or absl" + " modules if you need command line parsing." +) + +_CONTRIB_CUDNN_RNN_WARNING = ( + ast_edits.WARNING, + "(Manual edit required) tf.contrib.cudnn_rnn.* has been deprecated, " + "and the CuDNN kernel has been integrated with " + "tf.keras.layers.LSTM/GRU in TensorFlow 2.0. Please check the new API " + "and use that instead." 
+) + +_CONTRIB_RNN_WARNING = ( + ast_edits.WARNING, + "(Manual edit required) tf.contrib.rnn.* has been deprecated, and " + "widely used cells/functions will be moved to tensorflow/addons " + "repository. Please check it there and file Github issues if necessary." +) + +_CONTRIB_DIST_STRAT_WARNING = ( + ast_edits.WARNING, + "(Manual edit required) tf.contrib.distribute.* have been migrated to" + "tf.distribute.*. Please check out the new module for updates APIs.") + +MODULE_DEPRECATIONS = { + "tf.contrib": _CONTRIB_WARNING, + "tf.contrib.cudnn_rnn": _CONTRIB_CUDNN_RNN_WARNING, + "tf.contrib.rnn": _CONTRIB_RNN_WARNING, + "tf.flags": _FLAGS_WARNING, + "tf.contrib.distribute": _CONTRIB_DIST_STRAT_WARNING +} diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py index e446f674dd7..e55ad592bff 100644 --- a/tensorflow/tools/compatibility/tf_upgrade_v2.py +++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py @@ -27,6 +27,7 @@ import pasta from tensorflow.tools.compatibility import all_renames_v2 from tensorflow.tools.compatibility import ast_edits +from tensorflow.tools.compatibility import module_deprecations_v2 from tensorflow.tools.compatibility import reorders_v2 # These pylint warnings are a mistake. @@ -622,34 +623,6 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec): self.function_reorders = dict(reorders_v2.reorders) self.function_reorders.update(self.manual_function_reorders) - contrib_warning = ( - ast_edits.ERROR, - "<function name> cannot be converted automatically. tf.contrib will not" - " be distributed with TensorFlow 2.0, please consider an alternative in" - " non-contrib TensorFlow, a community-maintained repository, or fork " - "the required code." - ) - - flags_warning = ( - ast_edits.ERROR, - "tf.flags has been removed, please use the argparse or absl" - " modules if you need command line parsing.") - - contrib_cudnn_rnn_warning = ( - ast_edits.WARNING, - "(Manual edit required) tf.contrib.cudnn_rnn.* has been deprecated, " - "and the CuDNN kernel has been integrated with " - "tf.keras.layers.LSTM/GRU in TensorFlow 2.0. Please check the new API " - "and use that instead." - ) - - contrib_rnn_warning = ( - ast_edits.WARNING, - "(Manual edit required) tf.contrib.rnn.* has been deprecated, and " - "widely used cells/functions will be moved to tensorflow/addons " - "repository. Please check it there and file Github issues if necessary." - ) - decay_function_comment = ( ast_edits.INFO, "To use learning rate decay schedules with TensorFlow 2.0, switch to " @@ -683,10 +656,9 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec): ) initializers_no_dtype_comment = ( - ast_edits.INFO, - "Initializers no longer have the " + ast_edits.INFO, "Initializers no longer have the " "dtype argument in the constructor or partition_info argument in the " - "__call__ method.\nThe calls have been converted to compat.v1 for" + "__call__ method.\nThe calls have been converted to compat.v1 for " "safety (even though they may already have been correct).") metrics_comment = ( @@ -791,11 +763,6 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec): "default, instead of HDF5. To continue saving to HDF5, add the " "argument save_format='h5' to the save() function.") - contrib_dist_strat_warning = ( - ast_edits.WARNING, - "(Manual edit required) tf.contrib.distribute.* have been migrated to" - "tf.distribute.*. 
Please check out the new module for updates APIs.") - distribute_strategy_api_changes = ( "If you're using the strategy with a " "custom training loop, note the following changes in methods: " @@ -1505,13 +1472,7 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec): arg_value_ast=ast.Str("h5")), } - self.module_deprecations = { - "tf.contrib": contrib_warning, - "tf.contrib.cudnn_rnn": contrib_cudnn_rnn_warning, - "tf.contrib.rnn": contrib_rnn_warning, - "tf.flags": flags_warning, - "tf.contrib.distribute": contrib_dist_strat_warning - } + self.module_deprecations = module_deprecations_v2.MODULE_DEPRECATIONS def _is_ast_str(node): diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py index 36e30f559e3..3c4263ed809 100644 --- a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py +++ b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py @@ -19,11 +19,22 @@ from __future__ import division from __future__ import print_function import argparse +import sys from tensorflow.tools.compatibility import ast_edits from tensorflow.tools.compatibility import tf_upgrade_v2 +from tensorflow.tools.compatibility import tf_upgrade_v2_safety from tensorflow.tools.compatibility import ipynb +# Make straightforward changes to convert to 2.0. In harder cases, +# use compat.v1. +_DEFAULT_MODE = "DEFAULT" + +# Convert to use compat.v1. +# TODO(kaftan): remove EXPERIMENTAL_ prefix once safety mode is +# implemented. +_SAFETY_MODE = "EXPERIMENTAL_SAFETY" + def process_file(in_filename, out_filename, upgrader): """Process a file of type `.py` or `.ipynb`.""" @@ -91,9 +102,27 @@ Simple usage: "stored." "(default: %(default)s)"), default="report.txt") + parser.add_argument( + "--mode", + dest="mode", + choices=[_DEFAULT_MODE, _SAFETY_MODE], + help=("Upgrade script mode. Supported modes:\n" + "%s: Perform only straightforward conversions to upgrade to " + "2.0. In more difficult cases, switch to use compat.v1.\n" + "%s: Keep 1.* code intact and import compat.v1 " + "module. Note: safety mode is under development and not available " + "yet." % (_DEFAULT_MODE, _SAFETY_MODE)), + default=_DEFAULT_MODE) args = parser.parse_args() - upgrade = ast_edits.ASTCodeUpgrader(tf_upgrade_v2.TFAPIChangeSpec()) + if args.mode == _SAFETY_MODE: + change_spec = tf_upgrade_v2_safety.TFAPIChangeSpec() + sys.stderr.write( + "%s mode is not fully implemented yet." % _SAFETY_MODE) + else: + change_spec = tf_upgrade_v2.TFAPIChangeSpec() + upgrade = ast_edits.ASTCodeUpgrader(change_spec) + report_text = None report_filename = args.report_filename files_processed = 0 diff --git a/tensorflow/examples/saved_model/integration_tests/run_script.py b/tensorflow/tools/compatibility/tf_upgrade_v2_safety.py similarity index 53% rename from tensorflow/examples/saved_model/integration_tests/run_script.py rename to tensorflow/tools/compatibility/tf_upgrade_v2_safety.py index 438df40d340..02ade7bb812 100644 --- a/tensorflow/examples/saved_model/integration_tests/run_script.py +++ b/tensorflow/tools/compatibility/tf_upgrade_v2_safety.py @@ -12,25 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Utility to create a single py_binary that can call multiple py_binaries. - -This simulates executing a python script by importing a module name by the -environment 'SCRIPT_NAME' and executing its main via `app.run`. 
-""" +"""Upgrader for Python scripts from 1.* to 2.0 TensorFlow using SAFETY mode.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import importlib -import os -import sys - -from absl import app +from tensorflow.tools.compatibility import ast_edits +from tensorflow.tools.compatibility import module_deprecations_v2 -if __name__ == '__main__': - # Append current path to import path and execute `SCRIPT_NAME` main. - sys.path.extend([os.path.dirname(__file__)]) - module_name = os.environ['SCRIPT_NAME'] - app.run(importlib.import_module(module_name).main) +class TFAPIChangeSpec(ast_edits.APIChangeSpec): + """List of maps that describe what changed in the API.""" + + def __init__(self): + self.function_keyword_renames = {} + self.symbol_renames = {} + self.change_to_function = {} + self.function_reorders = {} + self.function_warnings = {} + self.function_transformers = {} + self.module_deprecations = module_deprecations_v2.MODULE_DEPRECATIONS + + # TODO(kaftan,annarev): specify replacement from TensorFlow import to + # compat.v1 import. diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_safety_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_safety_test.py new file mode 100644 index 00000000000..8890d631c34 --- /dev/null +++ b/tensorflow/tools/compatibility/tf_upgrade_v2_safety_test.py @@ -0,0 +1,47 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for tf 2.0 upgrader in safety mode.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import six + +from tensorflow.python.framework import test_util +from tensorflow.python.platform import test as test_lib +from tensorflow.tools.compatibility import ast_edits +from tensorflow.tools.compatibility import tf_upgrade_v2_safety + + +class TfUpgradeV2SafetyTest(test_util.TensorFlowTestCase): + + def _upgrade(self, old_file_text): + in_file = six.StringIO(old_file_text) + out_file = six.StringIO() + upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade_v2_safety.TFAPIChangeSpec()) + count, report, errors = ( + upgrader.process_opened_file("test.py", in_file, + "test_out.py", out_file)) + return count, report, errors, out_file.getvalue() + + def testContribWarning(self): + text = "tf.contrib.foo()" + _, report, _, _ = self._upgrade(text) + expected_info = "tf.contrib will not be distributed" + self.assertIn(expected_info, report) + + +if __name__ == "__main__": + test_lib.main() diff --git a/tensorflow/tools/compatibility/update/BUILD b/tensorflow/tools/compatibility/update/BUILD index 593603cd039..1758e0ec9f9 100644 --- a/tensorflow/tools/compatibility/update/BUILD +++ b/tensorflow/tools/compatibility/update/BUILD @@ -5,6 +5,7 @@ package(default_visibility = ["//visibility:private"]) py_binary( name = "generate_v2_renames_map", srcs = ["generate_v2_renames_map.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow:tensorflow_py", @@ -19,6 +20,7 @@ py_binary( py_binary( name = "generate_v2_reorders_map", srcs = ["generate_v2_reorders_map.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow:tensorflow_py", diff --git a/tensorflow/tools/git/BUILD b/tensorflow/tools/git/BUILD index 34a51679485..580781d79d0 100644 --- a/tensorflow/tools/git/BUILD +++ b/tensorflow/tools/git/BUILD @@ -9,5 +9,6 @@ licenses(["notice"]) # Apache 2.0 py_binary( name = "gen_git_source", srcs = ["gen_git_source.py"], + python_version = "PY2", srcs_version = "PY2AND3", ) diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 8303da86303..87f76766880 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -61,7 +61,6 @@ COMMON_PIP_DEPS = [ ":included_headers", "//tensorflow:tensorflow_py", "//tensorflow/examples/saved_model/integration_tests:mnist_util", - "//tensorflow/examples/saved_model/integration_tests:util", "//tensorflow/lite/python/testdata:interpreter_test_data", "//tensorflow/lite/python:tflite_convert", "//tensorflow/lite/toco/python:toco_from_protos", diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index cedf1491904..0e7240f9ccd 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -53,7 +53,7 @@ REQUIRED_PACKAGES = [ 'absl-py >= 0.7.0', 'astor >= 0.6.0', 'gast >= 0.2.0', - 'google_pasta >= 0.1.2', + 'google_pasta >= 0.1.6', 'keras_applications >= 1.0.6', 'keras_preprocessing >= 1.0.5', 'numpy >= 1.14.5, < 2.0', diff --git a/tensorflow/tools/tensorflow_builder/BUILD b/tensorflow/tools/tensorflow_builder/BUILD index 0935dc5c858..2798cbc4d34 100644 --- a/tensorflow/tools/tensorflow_builder/BUILD +++ b/tensorflow/tools/tensorflow_builder/BUILD @@ -15,6 +15,7 @@ py_binary( data = [ "//tensorflow/tools/tensorflow_builder/data/golden:cuda_cc_golden", ], + 
python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":cuda_compute_capability", @@ -26,6 +27,7 @@ py_binary( py_binary( name = "cuda_compute_capability", srcs = ["data/cuda_compute_capability.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "@absl_py//absl:app", diff --git a/tensorflow/tools/test/BUILD b/tensorflow/tools/test/BUILD index ef12226ec00..0d26c0198ef 100644 --- a/tensorflow/tools/test/BUILD +++ b/tensorflow/tools/test/BUILD @@ -32,6 +32,7 @@ py_library( py_binary( name = "system_info", srcs = ["system_info.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":system_info_lib", @@ -55,6 +56,7 @@ py_library( py_binary( name = "run_and_gather_logs", srcs = ["run_and_gather_logs.py"], + python_version = "PY2", srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [":run_and_gather_logs_main_lib"], diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 850fe8c88d8..3eee224800a 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -137,11 +137,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "com_google_absl", build_file = clean_dep("//third_party:com_google_absl.BUILD"), - sha256 = "01ba1185a0e6e048e4890f39e383515195bc335f0627cdddc0c325ee68be4434", - strip_prefix = "abseil-cpp-cd86d0d20ab167c33b23d3875db68d1d4bad3a3b", + sha256 = "c44f5a87695925aa0c9c4a207c7b4d77c21011f9627717337827fe25ccb867a2", + strip_prefix = "abseil-cpp-0cbdc774b97f7e80ab60dbe2ed4eaca3b2e33fc8", urls = [ - "http://mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/cd86d0d20ab167c33b23d3875db68d1d4bad3a3b.tar.gz", - "https://github.com/abseil/abseil-cpp/archive/cd86d0d20ab167c33b23d3875db68d1d4bad3a3b.tar.gz", + "http://mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/0cbdc774b97f7e80ab60dbe2ed4eaca3b2e33fc8.tar.gz", + "https://github.com/abseil/abseil-cpp/archive/0cbdc774b97f7e80ab60dbe2ed4eaca3b2e33fc8.tar.gz", ], ) @@ -149,11 +149,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): name = "eigen_archive", build_file = clean_dep("//third_party:eigen.BUILD"), patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"), - sha256 = "39ddb7d05b5b49f7d2485cdfc8be4d4b528d40ae6524821a2b91d464f6e317b9", - strip_prefix = "eigen-eigen-8adbe5681ed1", + sha256 = "74845ea27e19a1bcf63f3f271de62e06798f23e0467bb9d45b83a94918941b23", + strip_prefix = "eigen-eigen-20cbc6576426", urls = [ - "http://mirror.tensorflow.org/bitbucket.org/eigen/eigen/get/8adbe5681ed1.tar.gz", - "https://bitbucket.org/eigen/eigen/get/8adbe5681ed1.tar.gz", + "http://mirror.tensorflow.org/bitbucket.org/eigen/eigen/get/20cbc6576426.tar.gz", + "https://bitbucket.org/eigen/eigen/get/20cbc6576426.tar.gz", ], ) @@ -192,15 +192,15 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "com_github_googlecloudplatform_google_cloud_cpp", - sha256 = "f5600fdf3efd28e3142a60c20574e349511104fc6f658faf7974f6ae2def245a", - strip_prefix = "google-cloud-cpp-0.8.1", + sha256 = "a072103546cfa041ad8bfc599fe5a20c58e005a1a0ee18e94b2554dc3d485604", + strip_prefix = "google-cloud-cpp-0.9.0", system_build_file = clean_dep("//third_party/systemlibs:google_cloud_cpp.BUILD"), system_link_files = { "//third_party/systemlibs:google_cloud_cpp.google.cloud.bigtable.BUILD": "google/cloud/bigtable/BUILD", }, urls = [ - "http://mirror.tensorflow.org/github.com/googleapis/google-cloud-cpp/archive/v0.8.1.tar.gz", - "https://github.com/googleapis/google-cloud-cpp/archive/v0.8.1.tar.gz", + 
"http://mirror.tensorflow.org/github.com/googleapis/google-cloud-cpp/archive/v0.9.0.tar.gz", + "https://github.com/googleapis/google-cloud-cpp/archive/v0.9.0.tar.gz", ], ) @@ -834,11 +834,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "tflite_ovic_testdata", build_file = clean_dep("//third_party:tflite_ovic_testdata.BUILD"), - sha256 = "21288dccc517acee47fa9648d4d3da28bf0fef5381911ed7b4d2ee36366ffa20", + sha256 = "033c941b7829b05ca55a124a26a6a0581b1ececc154a2153cafcfdb54f80dca2", strip_prefix = "ovic", urls = [ - "http://mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/data/ovic_2018_10_23.zip", - "https://storage.googleapis.com/download.tensorflow.org/data/ovic_2018_10_23.zip", + "http://mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/data/ovic_2019_04_30.zip", + "https://storage.googleapis.com/download.tensorflow.org/data/ovic_2019_04_30.zip", ], ) diff --git a/third_party/FP16/BUILD.bazel b/third_party/FP16/BUILD.bazel index b2bb250a15d..e1018beb443 100644 --- a/third_party/FP16/BUILD.bazel +++ b/third_party/FP16/BUILD.bazel @@ -9,7 +9,7 @@ exports_files(["LICENSE"]) cc_library( name = "FP16", - hdrs = ["include/fp16.h"], + hdrs = glob(["include/**/*.h"]), includes = ["include"], strip_include_prefix = "include", ) diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl index 3e404d5dff7..80cc8e45ccf 100644 --- a/third_party/gpus/cuda/BUILD.tpl +++ b/third_party/gpus/cuda/BUILD.tpl @@ -28,36 +28,45 @@ config_setting( config_setting( name = "darwin", values = {"cpu": "darwin"}, - visibility = ["//visibility:public"], ) config_setting( name = "freebsd", values = {"cpu": "freebsd"}, - visibility = ["//visibility:public"], ) +# Provides CUDA headers for '#include "third_party/gpus/cuda/include/cuda.h"' +# All clients including TensorFlow should use these directives. cc_library( - name = "cuda_headers", + name = "cuda_virtual_headers", hdrs = [ "cuda/cuda_config.h", - %{cuda_headers} + ":cuda-include" + ], + include_prefix = "third_party/gpus", + visibility = ["//visibility:private"], +) + +# Provides CUDA headers for '#include <cuda.h>'. +# CUDA itself as well as Eigen use these directives. +cc_library( + name = "cuda_headers", + textual_hdrs = [ + # TODO(csigg): change references to third_party/gpus/cuda/cuda_config.h + # (e.g. in the PIP build script) and then remove cuda_config.h. 
+ "cuda/cuda_config.h", + ":cuda-include" ], includes = [ - ".", + ".", # required to include cuda/cuda/cuda_config.h as cuda/config.h "cuda/include", - "cuda/include/crt", ], - visibility = ["//visibility:public"], + deps = [":cuda_virtual_headers"], ) cc_library( name = "cudart_static", srcs = ["cuda/lib/%{cudart_static_lib}"], - includes = [ - ".", - "cuda/include", - ], linkopts = select({ ":freebsd": [], "//conditions:default": ["-ldl"], @@ -65,104 +74,82 @@ cc_library( "-lpthread", %{cudart_static_linkopt} ], - visibility = ["//visibility:public"], ) cc_library( name = "cuda_driver", srcs = ["cuda/lib/%{cuda_driver_lib}"], - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], ) cc_library( name = "cudart", srcs = ["cuda/lib/%{cudart_lib}"], data = ["cuda/lib/%{cudart_lib}"], - includes = [ - ".", - "cuda/include", - ], linkstatic = 1, - visibility = ["//visibility:public"], +) + +cc_library( + name = "cublas_virtual_headers", + hdrs = [":cublas-include"], + include_prefix = "third_party/gpus/cuda/include", + strip_include_prefix = "cublas/include", + visibility = ["//visibility:private"], + deps = [":cuda_headers"], +) + +cc_library( + name = "cublas_headers", + textual_hdrs = [":cublas-include"], + includes = ["cublas/include"], + deps = [":cublas_virtual_headers"], ) cc_library( name = "cublas", srcs = ["cuda/lib/%{cublas_lib}"], data = ["cuda/lib/%{cublas_lib}"], - includes = [ - ".", - "cuda/include", - ], linkstatic = 1, - visibility = ["//visibility:public"], ) cc_library( name = "cusolver", srcs = ["cuda/lib/%{cusolver_lib}"], data = ["cuda/lib/%{cusolver_lib}"], - includes = [ - ".", - "cuda/include", - ], linkopts = ["-lgomp"], linkstatic = 1, - visibility = ["//visibility:public"], ) cc_library( name = "cudnn", srcs = ["cuda/lib/%{cudnn_lib}"], data = ["cuda/lib/%{cudnn_lib}"], - includes = [ - ".", - "cuda/include", - ], linkstatic = 1, - visibility = ["//visibility:public"], ) cc_library( name = "cudnn_header", - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], + hdrs = [":cudnn-include"], + include_prefix = "third_party/gpus/cudnn", + strip_include_prefix = "cudnn/include", + deps = [":cuda_headers"], ) cc_library( name = "cufft", srcs = ["cuda/lib/%{cufft_lib}"], data = ["cuda/lib/%{cufft_lib}"], - includes = [ - ".", - "cuda/include", - ], linkstatic = 1, - visibility = ["//visibility:public"], ) cc_library( name = "curand", srcs = ["cuda/lib/%{curand_lib}"], data = ["cuda/lib/%{curand_lib}"], - includes = [ - ".", - "cuda/include", - ], linkstatic = 1, - visibility = ["//visibility:public"], ) cc_library( name = "cuda", - visibility = ["//visibility:public"], deps = [ ":cublas", ":cuda_headers", @@ -173,46 +160,37 @@ cc_library( ], ) +cc_library( + name = "cupti_virtual_headers", + hdrs = [":cuda-extras"], + include_prefix="third_party/gpus", + visibility = ["//visibility:private"], + deps = [":cuda_headers"], +) + cc_library( name = "cupti_headers", - hdrs = [ - "cuda/cuda_config.h", - ":cuda-extras", - ], - includes = [ - ".", - "cuda/extras/CUPTI/include/", - ], - visibility = ["//visibility:public"], + textual_hdrs = [":cuda-extras"], + includes = ["cuda/extras/CUPTI/include/"], + deps = [":cupti_virtual_headers"], ) cc_library( name = "cupti_dsos", data = ["cuda/lib/%{cupti_lib}"], - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], ) cc_library( name = "cusparse", srcs = ["cuda/lib/%{cusparse_lib}"], data = ["cuda/lib/%{cusparse_lib}"], - includes = [ - ".", - 
"cuda/include", - ], linkopts = ["-lgomp"], linkstatic = 1, - visibility = ["//visibility:public"], ) cc_library( name = "libdevice_root", data = [":cuda-nvvm"], - visibility = ["//visibility:public"], ) %{copy_rules} diff --git a/third_party/gpus/cuda/BUILD.windows.tpl b/third_party/gpus/cuda/BUILD.windows.tpl index a8dc7228394..52748fd04bf 100644 --- a/third_party/gpus/cuda/BUILD.windows.tpl +++ b/third_party/gpus/cuda/BUILD.windows.tpl @@ -28,27 +28,41 @@ config_setting( config_setting( name = "darwin", values = {"cpu": "darwin"}, - visibility = ["//visibility:public"], ) config_setting( name = "freebsd", values = {"cpu": "freebsd"}, - visibility = ["//visibility:public"], ) +# Provides CUDA headers for '#include "third_party/gpus/cuda/include/cuda.h"' +# All clients including TensorFlow should use these directives. cc_library( - name = "cuda_headers", + name = "cuda_virtual_headers", hdrs = [ "cuda/cuda_config.h", - %{cuda_headers} + ":cuda-include" + ], + include_prefix = "third_party/gpus", + visibility = ["//visibility:private"], +) + +# Provides CUDA headers for '#include <cuda.h>'. +# CUDA itself as well as Eigen use these directives. +cc_library( + name = "cuda_headers", + textual_hdrs = [ + # TODO(csigg): change references to third_party/gpus/cuda/cuda_config.h + # (e.g. in the PIP build script) and then remove cuda_config.h. + "cuda/cuda_config.h", + ":cuda-include" ], includes = [ - ".", + ".", # required to include cuda/cuda/cuda_config.h as cuda/config.h "cuda/include", - "cuda/include/crt", ], - visibility = ["//visibility:public"], +) + deps = [":cuda_virtual_headers"], ) cc_import( @@ -60,70 +74,76 @@ cc_import( # TODO(pcloudy): Remove this rule after b/111278841 is resolved. interface_library = "cuda/lib/%{cudart_static_lib}", system_provided = 1, - visibility = ["//visibility:public"], ) cc_import( name = "cuda_driver", interface_library = "cuda/lib/%{cuda_driver_lib}", system_provided = 1, - visibility = ["//visibility:public"], ) cc_import( name = "cudart", interface_library = "cuda/lib/%{cudart_lib}", system_provided = 1, - visibility = ["//visibility:public"], +) + +cc_library( + name = "cublas_virtual_headers", + hdrs = [":cublas-include"], + include_prefix = "third_party/gpus/cuda/include", + strip_include_prefix = "cublas/include", + visibility = ["//visibility:private"], + deps = [":cuda_headers"], +) + +cc_library( + name = "cublas_headers", + textual_hdrs = [":cublas-include"], + includes = ["cublas/include"], + deps = [":cublas_virtual_headers"], ) cc_import( name = "cublas", interface_library = "cuda/lib/%{cublas_lib}", system_provided = 1, - visibility = ["//visibility:public"], ) cc_import( name = "cusolver", interface_library = "cuda/lib/%{cusolver_lib}", system_provided = 1, - visibility = ["//visibility:public"], ) cc_import( name = "cudnn", interface_library = "cuda/lib/%{cudnn_lib}", system_provided = 1, - visibility = ["//visibility:public"], ) cc_library( name = "cudnn_header", - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], + hdrs = [":cudnn-include"], + include_prefix = "third_party/gpus/cudnn", + strip_include_prefix = "cudnn/include", + deps = [":cuda_headers"], ) cc_import( name = "cufft", interface_library = "cuda/lib/%{cufft_lib}", system_provided = 1, - visibility = ["//visibility:public"], ) cc_import( name = "curand", interface_library = "cuda/lib/%{curand_lib}", system_provided = 1, - visibility = ["//visibility:public"], ) cc_library( name = "cuda", - visibility = ["//visibility:public"], deps = [ 
":cublas", ":cuda_headers", @@ -134,38 +154,36 @@ cc_library( ], ) +cc_library( + name = "cupti_virtual_headers", + hdrs = [":cuda-extras"], + include_prefix="third_party/gpus", + visibility = ["//visibility:private"], + deps = [":cuda_headers"], +) + cc_library( name = "cupti_headers", - hdrs = [ - "cuda/cuda_config.h", - ":cuda-extras", - ], - includes = [ - ".", - "cuda/", - "cuda/extras/CUPTI/include/", - ], - visibility = ["//visibility:public"], + textual_hdrs = [":cuda-extras"], + includes = ["cuda/extras/CUPTI/include/"], + deps = [":cupti_virtual_headers"], ) cc_import( name = "cupti_dsos", interface_library = "cuda/lib/%{cupti_lib}", system_provided = 1, - visibility = ["//visibility:public"], ) cc_import( name = "cusparse", interface_library = "cuda/lib/%{cusparse_lib}", system_provided = 1, - visibility = ["//visibility:public"], ) cc_library( name = "libdevice_root", data = [":cuda-nvvm"], - visibility = ["//visibility:public"], ) %{copy_rules} diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index b2e3f669d76..f08cca6ebb6 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -784,8 +784,11 @@ def _create_dummy_repository(repository_ctx): "%{curand_lib}": lib_name("curand", cpu_value), "%{cupti_lib}": lib_name("cupti", cpu_value), "%{cusparse_lib}": lib_name("cusparse", cpu_value), - "%{copy_rules}": "", - "%{cuda_headers}": "", + "%{copy_rules}": """ +filegroup(name="cuda-include") +filegroup(name="cublas-include") +filegroup(name="cudnn-include") +""", }, ) @@ -983,25 +986,21 @@ def _create_local_cuda_repository(repository_ctx): out_dir = "cuda/extras/CUPTI/include", ), ] - included_files = _read_dir(repository_ctx, cuda_include_path) - if not any([file.endswith("cublas.h") for file in included_files]): - copy_rules.append(make_copy_files_rule( - repository_ctx, - name = "cublas-include", - srcs = [ - cublas_include_path + "/cublas.h", - cublas_include_path + "/cublas_v2.h", - cublas_include_path + "/cublas_api.h", - ], - outs = [ - "cuda/include/cublas.h", - "cuda/include/cublas_v2.h", - "cuda/include/cublas_api.h", - ], - )) - else: - copy_rules.append("filegroup(name = 'cublas-include')\n") + copy_rules.append(make_copy_files_rule( + repository_ctx, + name = "cublas-include", + srcs = [ + cublas_include_path + "/cublas.h", + cublas_include_path + "/cublas_v2.h", + cublas_include_path + "/cublas_api.h", + ], + outs = [ + "cublas/include/cublas.h", + "cublas/include/cublas_v2.h", + "cublas/include/cublas_api.h", + ], + )) cuda_libs = _find_libs(repository_ctx, cuda_config) cuda_lib_srcs = [] @@ -1023,16 +1022,12 @@ def _create_local_cuda_repository(repository_ctx): out_dir = "cuda/bin", )) - # Copy cudnn.h if cuDNN was not installed to CUDA_TOOLKIT_PATH. 
- if not any([file.endswith("cudnn.h") for file in included_files]): - copy_rules.append(make_copy_files_rule( - repository_ctx, - name = "cudnn-include", - srcs = [cudnn_header_dir + "/cudnn.h"], - outs = ["cuda/include/cudnn.h"], - )) - else: - copy_rules.append("filegroup(name = 'cudnn-include')\n") + copy_rules.append(make_copy_files_rule( + repository_ctx, + name = "cudnn-include", + srcs = [cudnn_header_dir + "/cudnn.h"], + outs = ["cudnn/include/cudnn.h"], + )) # Set up BUILD file for cuda/ _tpl( @@ -1062,11 +1057,6 @@ def _create_local_cuda_repository(repository_ctx): "%{cupti_lib}": cuda_libs["cupti"].basename, "%{cusparse_lib}": cuda_libs["cusparse"].basename, "%{copy_rules}": "\n".join(copy_rules), - "%{cuda_headers}": ( - '":cuda-include",\n' + - ' ":cublas-include",' + - ' ":cudnn-include",' - ), }, "cuda/BUILD", ) diff --git a/third_party/gpus/find_cuda_config.py b/third_party/gpus/find_cuda_config.py index 7662e9e46ae..d948593e518 100644 --- a/third_party/gpus/find_cuda_config.py +++ b/third_party/gpus/find_cuda_config.py @@ -53,6 +53,7 @@ tf_<library>_header_dir: ... tf_<library>_library_dir: ... """ +import io import os import glob import platform @@ -119,7 +120,7 @@ def _at_least_version(actual_version, required_version): def _get_header_version(path, name): """Returns preprocessor defines in C header file.""" - for line in open(path, "r").readlines(): + for line in io.open(path, "r", encoding="utf-8").readlines(): match = re.match("#define %s +(\d+)" % name, line) if match: return match.group(1) @@ -388,6 +389,13 @@ def _find_tensorrt_config(base_paths, required_version): header_path, header_version = _find_header(base_paths, "NvInfer.h", required_version, get_header_version) + + if ".." in header_version: + # From TRT 6.0 onwards, version information has been moved to NvInferVersion.h. + header_path, header_version = _find_header(base_paths, "NvInferVersion.h", + required_version, + get_header_version) + tensorrt_version = header_version.split(".")[0] library_path = _find_library(base_paths, "nvinfer", tensorrt_version) @@ -406,6 +414,20 @@ def _list_from_env(env_name, default=[]): return default +def _get_legacy_path(env_name, default=[]): + """Returns a path specified by a legacy environment variable. + + CUDNN_INSTALL_PATH, NCCL_INSTALL_PATH, TENSORRT_INSTALL_PATH set to + '/usr/lib/x86_64-linux-gnu' would previously find both library and header + paths. Detect those and return '/usr', otherwise forward to _list_from_env(). + """ + if env_name in os.environ: + match = re.match("^(/[^/ ]*)+/lib/\w+-linux-gnu/?$", os.environ[env_name]) + if match: + return [match.group(1)] + return _list_from_env(env_name, default) + + def _normalize_path(path): """Returns normalized path, with forward slashes on Windows.""" path = os.path.normpath(path) @@ -427,27 +449,27 @@ def find_cuda_config(): cuda_paths = _list_from_env("CUDA_TOOLKIT_PATH", base_paths) result.update(_find_cuda_config(cuda_paths, cuda_version)) - cublas_paths = _list_from_env("CUBLAS_INSTALL_PATH", base_paths) - # Add cuda paths in case CuBLAS is installed under CUDA_TOOLKIT_PATH. - cublas_paths += list(set(cuda_paths) - set(cublas_paths)) cuda_version = result["cuda_version"] + cublas_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (10, 1): + # Before CUDA 10.1, cuBLAS was in the same directory as the toolkit. 
+ cublas_paths = cuda_paths cublas_version = os.environ.get("TF_CUBLAS_VERSION", "") result.update( _find_cublas_config(cublas_paths, cublas_version, cuda_version)) if "cudnn" in libraries: - cudnn_paths = _list_from_env("CUDNN_INSTALL_PATH", base_paths) + cudnn_paths = _get_legacy_path("CUDNN_INSTALL_PATH", base_paths) cudnn_version = os.environ.get("TF_CUDNN_VERSION", "") result.update(_find_cudnn_config(cudnn_paths, cudnn_version)) if "nccl" in libraries: - nccl_paths = _list_from_env("NCCL_INSTALL_PATH", - base_paths) + _list_from_env("NCCL_HDR_PATH") + nccl_paths = _get_legacy_path("NCCL_INSTALL_PATH", base_paths) nccl_version = os.environ.get("TF_NCCL_VERSION", "") result.update(_find_nccl_config(nccl_paths, nccl_version)) if "tensorrt" in libraries: - tensorrt_paths = _list_from_env("TENSORRT_INSTALL_PATH", base_paths) + tensorrt_paths = _get_legacy_path("TENSORRT_INSTALL_PATH", base_paths) tensorrt_version = os.environ.get("TF_TENSORRT_VERSION", "") result.update(_find_tensorrt_config(tensorrt_paths, tensorrt_version)) diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl index cfb2599ae44..c0c03feb143 100644 --- a/third_party/nccl/nccl_configure.bzl +++ b/third_party/nccl/nccl_configure.bzl @@ -91,7 +91,12 @@ def _nccl_configure_impl(repository_ctx): else: # Create target for locally installed NCCL. config = find_cuda_config(repository_ctx, ["nccl"]) - repository_ctx.template("BUILD", _label("system.BUILD.tpl"), config) + config_wrap = { + "%{nccl_version}": config["nccl_version"], + "%{nccl_header_dir}": config["nccl_include_dir"], + "%{nccl_library_dir}": config["nccl_library_dir"], + } + repository_ctx.template("BUILD", _label("system.BUILD.tpl"), config_wrap) nccl_configure = repository_rule( implementation = _nccl_configure_impl, diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl index 3c5550abc9d..2ade5093f41 100644 --- a/third_party/tensorrt/tensorrt_configure.bzl +++ b/third_party/tensorrt/tensorrt_configure.bzl @@ -27,6 +27,24 @@ _DEFINE_TENSORRT_SONAME_MAJOR = "#define NV_TENSORRT_SONAME_MAJOR" _DEFINE_TENSORRT_SONAME_MINOR = "#define NV_TENSORRT_SONAME_MINOR" _DEFINE_TENSORRT_SONAME_PATCH = "#define NV_TENSORRT_SONAME_PATCH" +def _at_least_version(actual_version, required_version): + actual = [int(v) for v in actual_version.split(".")] + required = [int(v) for v in required_version.split(".")] + return actual >= required + +def _update_tensorrt_headers(tensorrt_version): + if not _at_least_version(tensorrt_version, "6"): + return + _TF_TENSORRT_HEADERS = [ + "NvInferVersion.h", + "NvInfer.h", + "NvUtils.h", + "NvInferPlugin.h", + "NvInferRTSafe.h", + "NvInferRTExt.h", + "NvInferPluginUtils.h", + ] + def _tpl(repository_ctx, tpl, substitutions): repository_ctx.template( tpl, @@ -69,6 +87,7 @@ def _tensorrt_configure_impl(repository_ctx): cpu_value = get_cpu_value(repository_ctx) # Copy the library and header files. 
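For reference, the legacy-path handling added to find_cuda_config.py above can be checked in isolation; this sketch (not part of the patch) reuses the regular expression from _get_legacy_path and assumes nothing beyond it:

    # How _get_legacy_path collapses a Debian multiarch library directory to
    # its prefix, so headers under /usr/include and libraries under
    # /usr/lib/x86_64-linux-gnu are both found starting from '/usr'.
    import re

    legacy = "/usr/lib/x86_64-linux-gnu"
    match = re.match(r"^(/[^/ ]*)+/lib/\w+-linux-gnu/?$", legacy)
    paths = [match.group(1)] if match else [legacy]
    print(paths)  # ['/usr']
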
+ _update_tensorrt_headers(trt_version) libraries = [lib_name(lib, cpu_value, trt_version) for lib in _TF_TENSORRT_LIBS] library_dir = config["tensorrt_library_dir"] + "/" headers = _TF_TENSORRT_HEADERS diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl index 7f5331994b1..9531e959717 100644 --- a/third_party/toolchains/preconfig/generate/containers.bzl +++ b/third_party/toolchains/preconfig/generate/containers.bzl @@ -1,6 +1,6 @@ """SHA 256 values for each image.""" container_digests = { - "ubuntu16.04": "sha256:d0d98c53111c3ec071aa81632a2b0d6f210e5c2411c5172e31f99002125ec4de", + "ubuntu16.04": "sha256:b90dcf2f35f3354909f4491bdf019c110b4b4d95ef0395ebf178bc5d523a4208", "centos6": "sha256:8402dc2bc0e9baa31a32caf182bf6a4f5f91852d1d5e3079175dfb4d2237cde8", "cuda10.0-cudnn7-ubuntu14.04": "sha256:d433e1221f802dac393bc8652fabcc63aa46896cd920bb888ae0e2002fe6b756", "cuda10.0-cudnn7-centos7": "sha256:a453b7147a60928a8345689eae48916a746b3578b5e831bfa151f0529d469c88", diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD index 960a38fc055..e9644d02d36 100755 --- a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD +++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD @@ -28,37 +28,45 @@ config_setting( config_setting( name = "darwin", values = {"cpu": "darwin"}, - visibility = ["//visibility:public"], ) config_setting( name = "freebsd", values = {"cpu": "freebsd"}, - visibility = ["//visibility:public"], ) +# Provides CUDA headers for '#include "third_party/gpus/cuda/include/cuda.h"' +# All clients including TensorFlow should use these directives. cc_library( - name = "cuda_headers", + name = "cuda_virtual_headers", hdrs = [ "cuda/cuda_config.h", ":cuda-include", - ":cudnn-include", ], + include_prefix = "third_party/gpus", + visibility = ["//visibility:private"], +) + +# Provides CUDA headers for '#include <cuda.h>'. +# CUDA itself as well as Eigen use these directives. +cc_library( + name = "cuda_headers", includes = [ - ".", + ".", # required to include cuda/cuda/cuda_config.h as cuda/config.h "cuda/include", - "cuda/include/crt", ], - visibility = ["//visibility:public"], + textual_hdrs = [ + # TODO(csigg): change references to third_party/gpus/cuda/cuda_config.h + # (e.g. in the PIP build script) and then remove cuda_config.h. 
+ "cuda/cuda_config.h", + ":cuda-include", + ], + deps = [":cuda_virtual_headers"], ) cc_library( name = "cudart_static", srcs = ["cuda/lib/libcudart_static.a"], - includes = [ - ".", - "cuda/include", - ], linkopts = select({ ":freebsd": [], "//conditions:default": ["-ldl"], @@ -66,104 +74,82 @@ cc_library( "-lpthread", "-lrt", ], - visibility = ["//visibility:public"], ) cc_library( name = "cuda_driver", srcs = ["cuda/lib/libcuda.so"], - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], ) cc_library( name = "cudart", srcs = ["cuda/lib/libcudart.so.10.0"], data = ["cuda/lib/libcudart.so.10.0"], - includes = [ - ".", - "cuda/include", - ], linkstatic = 1, - visibility = ["//visibility:public"], +) + +cc_library( + name = "cublas_virtual_headers", + hdrs = [":cublas-include"], + include_prefix = "third_party/gpus/cuda/include", + strip_include_prefix = "cublas/include", + visibility = ["//visibility:private"], + deps = [":cuda_headers"], +) + +cc_library( + name = "cublas_headers", + includes = ["cublas/include"], + textual_hdrs = [":cublas-include"], + deps = [":cublas_virtual_headers"], ) cc_library( name = "cublas", srcs = ["cuda/lib/libcublas.so.10.0"], data = ["cuda/lib/libcublas.so.10.0"], - includes = [ - ".", - "cuda/include", - ], linkstatic = 1, - visibility = ["//visibility:public"], ) cc_library( name = "cusolver", srcs = ["cuda/lib/libcusolver.so.10.0"], data = ["cuda/lib/libcusolver.so.10.0"], - includes = [ - ".", - "cuda/include", - ], linkopts = ["-lgomp"], linkstatic = 1, - visibility = ["//visibility:public"], ) cc_library( name = "cudnn", srcs = ["cuda/lib/libcudnn.so.7"], data = ["cuda/lib/libcudnn.so.7"], - includes = [ - ".", - "cuda/include", - ], linkstatic = 1, - visibility = ["//visibility:public"], ) cc_library( name = "cudnn_header", - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], + hdrs = [":cudnn-include"], + include_prefix = "third_party/gpus/cudnn", + strip_include_prefix = "cudnn/include", + deps = [":cuda_headers"], ) cc_library( name = "cufft", srcs = ["cuda/lib/libcufft.so.10.0"], data = ["cuda/lib/libcufft.so.10.0"], - includes = [ - ".", - "cuda/include", - ], linkstatic = 1, - visibility = ["//visibility:public"], ) cc_library( name = "curand", srcs = ["cuda/lib/libcurand.so.10.0"], data = ["cuda/lib/libcurand.so.10.0"], - includes = [ - ".", - "cuda/include", - ], linkstatic = 1, - visibility = ["//visibility:public"], ) cc_library( name = "cuda", - visibility = ["//visibility:public"], deps = [ ":cublas", ":cuda_headers", @@ -174,46 +160,37 @@ cc_library( ], ) +cc_library( + name = "cupti_virtual_headers", + hdrs = [":cuda-extras"], + include_prefix = "third_party/gpus", + visibility = ["//visibility:private"], + deps = [":cuda_headers"], +) + cc_library( name = "cupti_headers", - hdrs = [ - "cuda/cuda_config.h", - ":cuda-extras", - ], - includes = [ - ".", - "cuda/extras/CUPTI/include/", - ], - visibility = ["//visibility:public"], + includes = ["cuda/extras/CUPTI/include/"], + textual_hdrs = [":cuda-extras"], + deps = [":cupti_virtual_headers"], ) cc_library( name = "cupti_dsos", data = ["cuda/lib/libcupti.so.10.0"], - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], ) cc_library( name = "cusparse", srcs = ["cuda/lib/libcusparse.so.10.0"], data = ["cuda/lib/libcusparse.so.10.0"], - includes = [ - ".", - "cuda/include", - ], linkopts = ["-lgomp"], linkstatic = 1, - visibility = ["//visibility:public"], ) cc_library( name = "libdevice_root", data 
= [":cuda-nvvm"], - visibility = ["//visibility:public"], ) genrule( @@ -1253,6 +1230,16 @@ genrule( cmd = """cp -rLf "/usr/local/cuda-10.0/extras/CUPTI/include/." "$(@D)/cuda/extras/CUPTI/include/" """, ) +genrule( + name = "cublas-include", + outs = [ + "cublas/include/cublas.h", + "cublas/include/cublas_v2.h", + "cublas/include/cublas_api.h", + ], + cmd = """cp -f "/usr/local/cuda-10.0/include/cublas.h" $(location cublas/include/cublas.h) && cp -f "/usr/local/cuda-10.0/include/cublas_v2.h" $(location cublas/include/cublas_v2.h) && cp -f "/usr/local/cuda-10.0/include/cublas_api.h" $(location cublas/include/cublas_api.h) """, +) + genrule( name = "cuda-lib", outs = [ @@ -1267,7 +1254,7 @@ genrule( "cuda/lib/libcupti.so.10.0", "cuda/lib/libcusparse.so.10.0", ], - cmd = """cp -f "/usr/local/cuda-10.0/lib64/stubs/libcuda.so" $(location cuda/lib/libcuda.so) && cp -f "/usr/local/cuda-10.0/lib64/libcudart.so.10.0" $(location cuda/lib/libcudart.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcudart_static.a" $(location cuda/lib/libcudart_static.a) && cp -f "/usr/local/cuda-10.0/lib64/libcublas.so.10.0" $(location cuda/lib/libcublas.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcusolver.so.10.0" $(location cuda/lib/libcusolver.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcusparse.so.10.0" $(location cuda/lib/libcusparse.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcurand.so.10.0" $(location cuda/lib/libcurand.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcufft.so.10.0" $(location cuda/lib/libcufft.so.10.0) && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7" $(location cuda/lib/libcudnn.so.7) && cp -f "/usr/local/cuda-10.0/extras/CUPTI/lib64/libcupti.so.10.0" $(location cuda/lib/libcupti.so.10.0) """, + cmd = """cp -f "/usr/local/cuda-10.0/lib64/stubs/libcuda.so" $(location cuda/lib/libcuda.so) && cp -f "/usr/local/cuda-10.0/lib64/libcudart.so.10.0" $(location cuda/lib/libcudart.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcudart_static.a" $(location cuda/lib/libcudart_static.a) && cp -f "/usr/local/cuda-10.0/lib64/libcublas.so.10.0" $(location cuda/lib/libcublas.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcusolver.so.10.0" $(location cuda/lib/libcusolver.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcurand.so.10.0" $(location cuda/lib/libcurand.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcufft.so.10.0" $(location cuda/lib/libcufft.so.10.0) && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7" $(location cuda/lib/libcudnn.so.7) && cp -f "/usr/local/cuda-10.0/extras/CUPTI/lib64/libcupti.so.10.0" $(location cuda/lib/libcupti.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcusparse.so.10.0" $(location cuda/lib/libcusparse.so.10.0) """, ) genrule( @@ -1297,7 +1284,7 @@ genrule( genrule( name = "cudnn-include", outs = [ - "cuda/include/cudnn.h", + "cudnn/include/cudnn.h", ], - cmd = """cp -f "/usr/include/cudnn.h" $(location cuda/include/cudnn.h) """, + cmd = """cp -f "/usr/include/cudnn.h" $(location cudnn/include/cudnn.h) """, ) diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD index 518a3b017b9..a8a0e57eaa4 100755 --- a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD +++ b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD @@ -3,12 +3,12 @@ licenses(["notice"]) -exports_files(["LICENSE"]) - load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts") package(default_visibility = ["//visibility:public"]) +exports_files(["LICENSE"]) + cc_library( name = 
"tensorrt_headers", hdrs = [":tensorrt_include"], @@ -18,15 +18,9 @@ cc_library( cc_library( name = "tensorrt", - srcs = [ - "tensorrt/lib/libnvinfer.so.5", - "tensorrt/lib/libnvinfer_plugin.so.5", - ], + srcs = [":tensorrt_lib"], copts = cuda_default_copts(), - data = [ - "tensorrt/lib/libnvinfer.so.5", - "tensorrt/lib/libnvinfer_plugin.so.5", - ], + data = [":tensorrt_lib"], include_prefix = "", linkstatic = 1, visibility = ["//visibility:public"],